#!/usr/bin/env bash
AIRFLOW_COMMAND="${1:-}"

set -euo pipefail

LD_PRELOAD="/usr/lib/$(uname -m)-linux-gnu/libstdc++.so.6"
export LD_PRELOAD

function run_check_with_retries {
    local cmd
    cmd="${1}"
    local countdown
    countdown="${CONNECTION_CHECK_MAX_COUNT}"

    while true
    do
        set +e
        local last_check_result
        local res
        last_check_result=$(eval "${cmd} 2>&1")
        res=$?
        set -e
        if [[ ${res} == 0 ]]; then
            echo
            break
        else
            echo -n "."
            countdown=$((countdown-1))
        fi
        if [[ ${countdown} == 0 ]]; then
            echo
            echo "ERROR! Maximum number of retries (${CONNECTION_CHECK_MAX_COUNT}) reached."
            echo
            echo "Last check result:"
            echo "$ ${cmd}"
            echo "${last_check_result}"
            echo
            exit 1
        else
            sleep "${CONNECTION_CHECK_SLEEP_TIME}"
        fi
    done
}

function run_nc() {
    # Checks if it is possible to connect to the host using netcat.
    #
    # We want to avoid misleading messages, so we perform only a forward lookup of the service
    # IP address. Netcat run without -n performs both forward and reverse lookups and fails if
    # the reverse lookup name does not match the original name, even if the host is reachable
    # via IP. This happens randomly with docker-compose in GitHub Actions.
    # Since we are not using reverse lookup elsewhere, we can perform the forward lookup in
    # Python and pass the resulting IP to nc with the '-n' switch, which disables any DNS use.
    # Even if the reverse-lookup failure is harmless, it can hide the real reason for the
    # problem (the long time needed to start some services) and be totally misleading when you
    # try to analyse the problem, so it is best to avoid it.
    local host="${1}"
    local port="${2}"
    local ip
    ip=$(python -c "import socket; print(socket.gethostbyname('${host}'))")
    nc -zvvn "${ip}" "${port}"
}

function wait_for_connection {
    # Waits for a connection to the backend specified via the URL passed as the first parameter.
    # Detects the backend type from the URL schema and assigns default port numbers if they are
    # not specified in the URL. Then it loops until a connection to the detected host/port can
    # be established. It tries `CONNECTION_CHECK_MAX_COUNT` times and sleeps
    # `CONNECTION_CHECK_SLEEP_TIME` seconds between checks.
    local connection_url
    connection_url="${1}"
    local detected_backend
    detected_backend=$(python -c "from urllib.parse import urlsplit; import sys; print(urlsplit(sys.argv[1]).scheme)" "${connection_url}")
    local detected_host
    detected_host=$(python -c "from urllib.parse import urlsplit; import sys; print(urlsplit(sys.argv[1]).hostname or '')" "${connection_url}")
    local detected_port
    detected_port=$(python -c "from urllib.parse import urlsplit; import sys; print(urlsplit(sys.argv[1]).port or '')" "${connection_url}")
    echo BACKEND="${BACKEND:=${detected_backend}}"
    readonly BACKEND
    if [[ -z "${detected_port=}" ]]; then
        if [[ ${BACKEND} == "postgres"* ]]; then
            detected_port=5432
        elif [[ ${BACKEND} == "mysql"* ]]; then
            detected_port=3306
        elif [[ ${BACKEND} == "mssql"* ]]; then
            detected_port=1433
        elif [[ ${BACKEND} == "redis"* ]]; then
            detected_port=6379
        elif [[ ${BACKEND} == "amqp"* ]]; then
            detected_port=5672
        fi
    fi
    detected_host=${detected_host:="localhost"}
    # Allow the DB parameters to be overridden by environment variables
    echo DB_HOST="${DB_HOST:=${detected_host}}"
    readonly DB_HOST
    echo DB_PORT="${DB_PORT:=${detected_port}}"
    readonly DB_PORT
    if [[ -n "${DB_HOST=}" ]] && [[ -n "${DB_PORT=}" ]]; then
        run_check_with_retries "run_nc ${DB_HOST@Q} ${DB_PORT@Q}"
    else
        >&2 echo "The connection details to the broker could not be determined. Connectivity checks were skipped."
    fi
}
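# Example (hypothetical broker URL): for
#   amqp://guest:guest@rabbitmq/
# wait_for_connection detects backend "amqp" and host "rabbitmq", falls back to the default
# AMQP port 5672 because the URL carries no explicit port, and then retries
# "run_nc 'rabbitmq' '5672'" until it succeeds or CONNECTION_CHECK_MAX_COUNT attempts fail.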
function create_www_user() {
    local local_password=""
    # Warning: command environment variables (*_CMD) take priority over the plain configuration
    # variables for parameters that carry sensitive information. This is the case for the SQL
    # database and the broker backend in this entrypoint script.
    if [[ -n "${_AIRFLOW_WWW_USER_PASSWORD_CMD=}" ]]; then
        local_password=$(eval "${_AIRFLOW_WWW_USER_PASSWORD_CMD}")
        unset _AIRFLOW_WWW_USER_PASSWORD_CMD
    elif [[ -n "${_AIRFLOW_WWW_USER_PASSWORD=}" ]]; then
        local_password="${_AIRFLOW_WWW_USER_PASSWORD}"
        unset _AIRFLOW_WWW_USER_PASSWORD
    fi
    if [[ -z ${local_password} ]]; then
        echo
        echo "ERROR! Airflow Admin password not set via _AIRFLOW_WWW_USER_PASSWORD or _AIRFLOW_WWW_USER_PASSWORD_CMD variables!"
        echo
        exit 1
    fi

    airflow users create \
        --username "${_AIRFLOW_WWW_USER_USERNAME="admin"}" \
        --firstname "${_AIRFLOW_WWW_USER_FIRSTNAME="Airflow"}" \
        --lastname "${_AIRFLOW_WWW_USER_LASTNAME="Admin"}" \
        --email "${_AIRFLOW_WWW_USER_EMAIL="airflowadmin@example.com"}" \
        --role "${_AIRFLOW_WWW_USER_ROLE="Admin"}" \
        --password "${local_password}" || true
}

function create_system_user_if_missing() {
    # This is needed for OpenShift-compatible container execution. OpenShift starts the image
    # with a random user id but keeps group 0 as the user's group. Our production image is
    # OpenShift-compatible, so all permissions on all folders are set so that group 0 can
    # exercise the same privileges as the default "airflow" user. This code checks whether the
    # current user is present in /etc/passwd and, if not, creates the system user dynamically,
    # including setting its HOME directory to /home/airflow so that (for example) the
    # ${HOME}/.local folder where airflow is installed can be automatically added to PYTHONPATH.
    if ! whoami &> /dev/null; then
        if [[ -w /etc/passwd ]]; then
            echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${AIRFLOW_USER_HOME_DIR}:/sbin/nologin" \
                >> /etc/passwd
        fi
        export HOME="${AIRFLOW_USER_HOME_DIR}"
    fi
}

function set_pythonpath_for_root_user() {
    # Airflow is installed as a local user application, which means that if the container runs
    # as root the application is not available, because Python then loads only system-wide
    # packages. This adds the packages installed for the local "airflow" user to PYTHONPATH.
    if [[ $UID == "0" ]]; then
        local python_major_minor
        python_major_minor="$(python --version | cut -d " " -f 2 | cut -d "." -f 1-2)"
        export PYTHONPATH="${AIRFLOW_USER_HOME_DIR}/.local/lib/python${python_major_minor}/site-packages:${PYTHONPATH:-}"
        >&2 echo "The container is run as root user. For security, consider using a regular user account."
    fi
}

function wait_for_airflow_db() {
    # Wait for the command to run successfully to validate the database connection.
    run_check_with_retries "airflow db check"
}
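# With the defaults set below (CONNECTION_CHECK_MAX_COUNT=20, CONNECTION_CHECK_SLEEP_TIME=3),
# wait_for_airflow_db retries "airflow db check" for roughly a minute, printing a "." for each
# failed attempt, before the entrypoint gives up and exits.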
run_check_with_retries "airflow db check" } function upgrade_db() { # Runs airflow db upgrade airflow db upgrade || true } function wait_for_celery_broker() { # Verifies connection to Celery Broker local executor executor="$(airflow config get-value core executor)" if [[ "${executor}" == "CeleryExecutor" ]]; then local connection_url connection_url="$(airflow config get-value celery broker_url)" wait_for_connection "${connection_url}" fi } function exec_to_bash_or_python_command_if_specified() { # If one of the commands: 'bash', 'python' is used, either run appropriate # command with exec if [[ ${AIRFLOW_COMMAND} == "bash" ]]; then shift exec "/bin/bash" "${@}" elif [[ ${AIRFLOW_COMMAND} == "python" ]]; then shift exec "python" "${@}" fi } function check_uid_gid() { if [[ $(id -g) == "0" ]]; then return fi if [[ $(id -u) == "50000" ]]; then >&2 echo >&2 echo "WARNING! You should run the image with GID (Group ID) set to 0" >&2 echo " even if you use 'airflow' user (UID=50000)" >&2 echo >&2 echo " You started the image with UID=$(id -u) and GID=$(id -g)" >&2 echo >&2 echo " This is to make sure you can run the image with an arbitrary UID in the future." >&2 echo >&2 echo " See more about it in the Airflow's docker image documentation" >&2 echo " http://airflow.apache.org/docs/docker-stack/entrypoint" >&2 echo # We still allow the image to run with `airflow` user. return else >&2 echo >&2 echo "ERROR! You should run the image with GID=0" >&2 echo >&2 echo " You started the image with UID=$(id -u) and GID=$(id -g)" >&2 echo >&2 echo "The image should always be run with GID (Group ID) set to 0 regardless of the UID used." >&2 echo " This is to make sure you can run the image with an arbitrary UID." >&2 echo >&2 echo " See more about it in the Airflow's docker image documentation" >&2 echo " http://airflow.apache.org/docs/docker-stack/entrypoint" # This will not work so we fail hard exit 1 fi } unset PIP_USER check_uid_gid umask 0002 CONNECTION_CHECK_MAX_COUNT=${CONNECTION_CHECK_MAX_COUNT:=20} readonly CONNECTION_CHECK_MAX_COUNT CONNECTION_CHECK_SLEEP_TIME=${CONNECTION_CHECK_SLEEP_TIME:=3} readonly CONNECTION_CHECK_SLEEP_TIME create_system_user_if_missing set_pythonpath_for_root_user if [[ "${CONNECTION_CHECK_MAX_COUNT}" -gt "0" ]]; then wait_for_airflow_db fi if [[ -n "${_AIRFLOW_DB_UPGRADE=}" ]] ; then upgrade_db fi if [[ -n "${_AIRFLOW_WWW_USER_CREATE=}" ]] ; then create_www_user fi if [[ -n "${_PIP_ADDITIONAL_REQUIREMENTS=}" ]] ; then >&2 echo >&2 echo "!!!!! Installing additional requirements: '${_PIP_ADDITIONAL_REQUIREMENTS}' !!!!!!!!!!!!" >&2 echo >&2 echo "WARNING: This is a development/test feature only. NEVER use it in production!" >&2 echo " Instead, build a custom image as described in" >&2 echo >&2 echo " https://airflow.apache.org/docs/docker-stack/build.html" >&2 echo >&2 echo " Adding requirements at container startup is fragile and is done every time" >&2 echo " the container starts, so it is onlny useful for testing and trying out" >&2 echo " of adding dependencies." >&2 echo pip install --root-user-action ignore --no-cache-dir ${_PIP_ADDITIONAL_REQUIREMENTS} fi exec_to_bash_or_python_command_if_specified "${@}" if [[ ${AIRFLOW_COMMAND} == "airflow" ]]; then AIRFLOW_COMMAND="${2:-}" shift fi if [[ ${AIRFLOW_COMMAND} =~ ^(scheduler|celery)$ ]] \ && [[ "${CONNECTION_CHECK_MAX_COUNT}" -gt "0" ]]; then wait_for_celery_broker fi exec "airflow" "${@}"