diff --git a/containers/stats/Dockerfile b/containers/stats/Dockerfile
index e8306e7..664e837 100644
--- a/containers/stats/Dockerfile
+++ b/containers/stats/Dockerfile
@@ -19,8 +19,11 @@
 ENV LANGUAGE en_US:en
 ENV LC_ALL en_US.UTF-8
 COPY *.sh /usr/local/bin/
+# requirements.txt generated by running `pipenv lock -r > ../../containers/stats/requirements.txt` in `scripts/metrics`.
+COPY requirements.txt /tmp/
+RUN pip install -q -r /tmp/requirements.txt
 RUN chmod og+rx /usr/local/bin/*.sh
-RUN wget https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 -O /usr/local/bin/cloud_sql_proxy;\
+RUN wget -q https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 -O /usr/local/bin/cloud_sql_proxy;\
     chmod +x /usr/local/bin/cloud_sql_proxy
 
 ENTRYPOINT ["entrypoint.sh"]
diff --git a/containers/stats/entrypoint.sh b/containers/stats/entrypoint.sh
index 442dbe6..4af31bb 100755
--- a/containers/stats/entrypoint.sh
+++ b/containers/stats/entrypoint.sh
@@ -17,4 +17,6 @@
 set -euo pipefail
 git clone --depth 1 https://github.com/google/llvm-premerge-checks.git ~/llvm-premerge-checks
 cd ~/llvm-premerge-checks
+git fetch origin "${SCRIPTS_REFSPEC:=main}":x
+git checkout x
 exec /usr/bin/tini -g -- $@
diff --git a/containers/stats/requirements.txt b/containers/stats/requirements.txt
new file mode 100644
index 0000000..f63f3d1
--- /dev/null
+++ b/containers/stats/requirements.txt
@@ -0,0 +1,27 @@
+backoff==1.10.0
+certifi==2020.12.5
+chardet==4.0.0
+ftfy==6.0.1; python_version >= '3.6'
+gitdb==4.0.7
+gitpython==3.1.17
+idna==2.10
+lxml==4.6.3
+mailchecker==4.0.7
+pathspec==0.8.1
+phabricator==0.8.1
+phonenumbers==8.12.23
+psycopg2-binary==2.8.6
+pyaml==20.4.0
+python-benedict==0.24.0
+python-dateutil==2.8.1
+python-fsutil==0.5.0
+python-slugify==5.0.2
+pyyaml==5.4.1
+requests==2.25.1
+six==1.16.0
+smmap==4.0.0
+text-unidecode==1.3
+toml==0.10.2
+urllib3==1.26.4
+wcwidth==0.2.5
+xmltodict==0.12.0
diff --git a/kubernetes/cron/buildbots.yaml b/kubernetes/cron/buildbots.yaml
new file mode 100644
index 0000000..d2b8124
--- /dev/null
+++ b/kubernetes/cron/buildbots.yaml
@@ -0,0 +1,63 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://llvm.org/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+apiVersion: batch/v1beta1
+kind: CronJob
+metadata:
+  name: buildbot-stats
+  namespace: buildkite
+spec:
+  schedule: "40 * * * *"
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 24
+  failedJobsHistoryLimit: 3
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          containers:
+            - name: collect-buildkite-stats
+              image: gcr.io/llvm-premerge-checks/stats:latest
+              args: ["/root/llvm-premerge-checks/scripts/metrics/connect_db.sh python3 buildbot_monitoring.py"]
+              env:
+                - name: BUILDKITE_AGENT_TOKEN
+                  valueFrom:
+                    secretKeyRef:
+                      name: buildkite-agent-token
+                      key: token
+                - name: POD_NAME
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: metadata.name
+                - name: CONDUIT_TOKEN
+                  valueFrom:
+                    secretKeyRef:
+                      name: conduit-api-token
+                      key: token
+                - name: BUILDKITE_API_TOKEN
+                  valueFrom:
+                    secretKeyRef:
+                      name: buildkite-api-token
+                      key: token
+                - name: DB_PASSWORD
+                  valueFrom:
+                    secretKeyRef:
+                      name: db-stats
+                      key: password
+                - name: SCRIPTS_REFSPEC
+                  value: "cron-2"
+          restartPolicy: Never
+          nodeSelector:
+            cloud.google.com/gke-nodepool: service
\ No newline at end of file
diff --git a/kubernetes/stats-cron.yaml b/kubernetes/cron/buildkite.yaml
similarity index 90%
rename from kubernetes/stats-cron.yaml
rename to kubernetes/cron/buildkite.yaml
index 711fd81..5b2f4c6 100644
--- a/kubernetes/stats-cron.yaml
+++ b/kubernetes/cron/buildkite.yaml
@@ -16,7 +16,7 @@
 apiVersion: batch/v1beta1
 kind: CronJob
 metadata:
-  name: collect-stats
+  name: buildkite-stats
   namespace: buildkite
 spec:
   schedule: "0 * * * *"
@@ -30,7 +30,7 @@ spec:
           containers:
             - name: collect-buildkite-stats
               image: gcr.io/llvm-premerge-checks/stats:latest
-              args: ["/root/llvm-premerge-checks/scripts/metrics/load_buildkite.sh"]
+              args: ["/root/llvm-premerge-checks/scripts/metrics/connect_db.sh python3 load_buildkite.py"]
               env:
                 - name: BUILDKITE_AGENT_TOKEN
                   valueFrom:
@@ -56,6 +56,8 @@
                     secretKeyRef:
                       name: db-stats
                       key: password
+                - name: SCRIPTS_REFSPEC
+                  value: "cron-2"
           restartPolicy: Never
           nodeSelector:
             cloud.google.com/gke-nodepool: service
\ No newline at end of file
diff --git a/kubernetes/cron/uptime.yaml b/kubernetes/cron/uptime.yaml
new file mode 100644
index 0000000..af37ce8
--- /dev/null
+++ b/kubernetes/cron/uptime.yaml
@@ -0,0 +1,63 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://llvm.org/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+apiVersion: batch/v1beta1
+kind: CronJob
+metadata:
+  name: uptime-stats
+  namespace: buildkite
+spec:
+  schedule: "20 * * * *"
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 24
+  failedJobsHistoryLimit: 3
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          containers:
+            - name: collect-buildkite-stats
+              image: gcr.io/llvm-premerge-checks/stats:latest
+              args: ["/root/llvm-premerge-checks/scripts/metrics/connect_db.sh python3 server_monitoring.py"]
+              env:
+                - name: BUILDKITE_AGENT_TOKEN
+                  valueFrom:
+                    secretKeyRef:
+                      name: buildkite-agent-token
+                      key: token
+                - name: POD_NAME
+                  valueFrom:
+                    fieldRef:
+                      fieldPath: metadata.name
+                - name: CONDUIT_TOKEN
+                  valueFrom:
+                    secretKeyRef:
+                      name: conduit-api-token
+                      key: token
+                - name: BUILDKITE_API_TOKEN
+                  valueFrom:
+                    secretKeyRef:
+                      name: buildkite-api-token
+                      key: token
+                - name: DB_PASSWORD
+                  valueFrom:
+                    secretKeyRef:
+                      name: db-stats
+                      key: password
+                - name: SCRIPTS_REFSPEC
+                  value: "cron-2"
+          restartPolicy: Never
+          nodeSelector:
+            cloud.google.com/gke-nodepool: service
\ No newline at end of file
diff --git a/scripts/metrics/buildbot_monitoring.py b/scripts/metrics/buildbot_monitoring.py
index 8d781c7..cc3a6c6 100755
--- a/scripts/metrics/buildbot_monitoring.py
+++ b/scripts/metrics/buildbot_monitoring.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-
+import logging
 import psycopg2
 import os
 import datetime
@@ -10,6 +10,7 @@ import json
 
 PHABRICATOR_URL = "https://reviews.llvm.org/api/"
 BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2/"
+
 
 # TODO(kuhnel): retry on connection issues, maybe resuse
 # https://github.com/google/llvm-premerge-checks/blob/main/scripts/phabtalk/phabtalk.py#L44
@@ -18,12 +19,9 @@ BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2/"
 
 
 def connect_to_db() -> psycopg2.extensions.connection:
-    """Connect to the database, create tables as needed."""
+    """Connect to the database."""
     conn = psycopg2.connect(
-        "host=127.0.0.1 sslmode=disable dbname=stats user={} password={}".format(
-            os.environ["PGUSER"], os.environ["PGPASSWORD"]
-        )
-    )
+        f"host=127.0.0.1 sslmode=disable dbname=buildbots user=stats password={os.getenv('DB_PASSWORD')}")
     return conn
 
 
@@ -81,7 +79,7 @@ def create_tables(conn: psycopg2.extensions.connection):
 
 
 def get_worker_status(
-    worker_id: int, conn: psycopg2.extensions.connection
+        worker_id: int, conn: psycopg2.extensions.connection
 ) -> Optional[Dict]:
     """Note: postgres returns a dict for a stored json object."""
     cur = conn.cursor()
@@ -96,7 +94,7 @@
 
 
 def get_builder_status(
-    builder_id: int, conn: psycopg2.extensions.connection
+        builder_id: int, conn: psycopg2.extensions.connection
 ) -> Optional[Dict]:
     """Note: postgres returns a dict for a stored json object."""
     cur = conn.cursor()
@@ -112,10 +110,10 @@
 
 
 def set_worker_status(
-    timestamp: datetime.datetime,
-    worker_id: int,
-    data: str,
-    conn: psycopg2.extensions.connection,
+        timestamp: datetime.datetime,
+        worker_id: int,
+        data: str,
+        conn: psycopg2.extensions.connection,
 ):
     cur = conn.cursor()
     cur.execute(
@@ -126,7 +124,7 @@
 
 
 def update_workers(conn: psycopg2.extensions.connection):
-    print("Updating worker status...")
+    logging.info("Updating worker status...")
    response = requests.get(BUILDBOT_URL + "workers")
    timestamp = datetime.datetime.now()
    for worker in response.json()["workers"]:
@@ -144,7 +142,7 @@
 
 def update_builders(conn: psycopg2.extensions.connection):
     """get list of all builder ids."""
-    print("Updating builder status...")
+    logging.info("Updating builder status...")
     response = requests.get(BUILDBOT_URL + "builders")
     timestamp = datetime.datetime.now()
     for builder in response.json()["builders"]:
@@ -174,13 +172,10 @@ def get_last_build(conn: psycopg2.extensions.connection) -> int:
 
 def update_build_status(conn: psycopg2.extensions.connection):
     start_id = get_last_build(conn)
-    print("Updating build results, starting with {}...".format(start_id))
+    logging.info("Updating build results, starting with {}...".format(start_id))
     url = BUILDBOT_URL + "builds"
     cur = conn.cursor()
-
-    for result_set in rest_request_iterator(
-        url, "builds", "buildid", start_id=start_id
-    ):
+    for result_set in rest_request_iterator(url, "builds", "buildid", start_id=start_id):
         args_str = b",".join(
             cur.mogrify(
                 b" (%s,%s,%s,%s) ",
@@ -194,21 +189,20 @@ def update_build_status(conn: psycopg2.extensions.connection):
             for build in result_set
             if build["complete"]
         )
-
         cur.execute(
             b"INSERT INTO buildbot_builds (build_id, builder_id, build_number, build_data) values "
             + args_str
         )
-        print(" {}".format(result_set[-1]["buildid"]))
+        logging.info("last build id: {}".format(result_set[-1]["buildid"]))
         conn.commit()
 
 
 def rest_request_iterator(
-    url: str,
-    array_field_name: str,
-    id_field_name: str,
-    start_id: int = 0,
-    step: int = 1000,
+        url: str,
+        array_field_name: str,
+        id_field_name: str,
+        start_id: int = 0,
+        step: int = 1000,
 ):
     """Request paginated data from the buildbot master.
 
@@ -252,12 +246,12 @@ def get_latest_buildset(conn: psycopg2.extensions.connection) -> int:
 
 
 def update_buildsets(conn: psycopg2.extensions.connection):
     start_id = get_latest_buildset(conn)
-    print("Getting buildsets, starting with {}...".format(start_id))
+    logging.info("Getting buildsets, starting with {}...".format(start_id))
     url = BUILDBOT_URL + "buildsets"
     cur = conn.cursor()
     for result_set in rest_request_iterator(
-        url, "buildsets", "bsid", start_id=start_id
+            url, "buildsets", "bsid", start_id=start_id
     ):
         args_str = b",".join(
             cur.mogrify(
@@ -273,7 +267,7 @@ def update_buildsets(conn: psycopg2.extensions.connection):
         cur.execute(
             b"INSERT INTO buildbot_buildsets (buildset_id, data) values " + args_str
         )
-        print(" {}".format(result_set[-1]["bsid"]))
+        logging.info("last id {}".format(result_set[-1]["bsid"]))
         conn.commit()
 
 
@@ -288,11 +282,11 @@ def get_latest_buildrequest(conn: psycopg2.extensions.connection) -> int:
 
 def update_buildrequests(conn: psycopg2.extensions.connection):
     start_id = get_latest_buildrequest(conn)
-    print("Getting buildrequests, starting with {}...".format(start_id))
+    logging.info("Getting buildrequests, starting with {}...".format(start_id))
     url = BUILDBOT_URL + "buildrequests"
     cur = conn.cursor()
     for result_set in rest_request_iterator(
-        url, "buildrequests", "buildrequestid", start_id=start_id
+            url, "buildrequests", "buildrequestid", start_id=start_id
     ):
         # cur.mogrify returns a byte string, so we need to join on a byte string
         args_str = b",".join(
@@ -313,12 +307,12 @@ def update_buildrequests(conn: psycopg2.extensions.connection):
             b"INSERT INTO buildbot_buildrequests (buildrequest_id, buildset_id, data) values "
             + args_str
         )
-        print(" {}".format(result_set[-1]["buildrequestid"]))
+        logging.info("{}".format(result_set[-1]["buildrequestid"]))
         conn.commit()
 
 
-def buildbot_monitoring():
-    """Main function of monitoring the phabricator server."""
+if __name__ == "__main__":
+    logging.basicConfig(level='INFO', format='%(levelname)-7s %(message)s')
     conn = connect_to_db()
     create_tables(conn)
     update_workers(conn)
@@ -326,8 +320,3 @@
     update_build_status(conn)
     update_buildsets(conn)
     update_buildrequests(conn)
-    print("Completed, exiting...")
-
-
-if __name__ == "__main__":
-    buildbot_monitoring()
diff --git a/scripts/metrics/connect_db.sh b/scripts/metrics/connect_db.sh
new file mode 100755
index 0000000..0b9cfa4
--- /dev/null
+++ b/scripts/metrics/connect_db.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://llvm.org/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# generate statistics on the llvm github repository
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+cd $SCRIPT_DIR
+# sleep to let the proxy come up
+cloud_sql_proxy -instances=llvm-premerge-checks:us-central1:buildkite-stats=tcp:0.0.0.0:5432 &
+sleep 3s
+$@
diff --git a/scripts/metrics/load_buildkite.py b/scripts/metrics/load_buildkite.py
index 1515967..6d6e67b 100644
--- a/scripts/metrics/load_buildkite.py
+++ b/scripts/metrics/load_buildkite.py
@@ -1,5 +1,4 @@
 import sys
-
 import psycopg2
 import psycopg2.extras
 import logging
@@ -14,6 +13,7 @@
 psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
 token = f'Bearer {os.getenv("BUILDKITE_API_TOKEN")}'
 
+
 def connect():
     return psycopg2.connect(
         f"host=127.0.0.1 sslmode=disable dbname=stats user=stats password={os.getenv('DB_PASSWORD')}")
@@ -57,7 +57,8 @@ where a.id IS NULL and j.raw->>'raw_log_url' IS NOT NULL
             content, en = download_text(url)
             meta['encoding'] = en
             with conn.cursor() as i:
-                i.execute('INSERT INTO artifacts (id, job_id, content, meta) VALUES (%s, %s, %s, %s)', [job_id, job_id, content, meta])
+                i.execute('INSERT INTO artifacts (id, job_id, content, meta) VALUES (%s, %s, %s, %s)',
+                          [job_id, job_id, content, meta])
         except:
             meta['failure'] = traceback.format_exc()
             logging.error(f'download artifact failed {meta["failure"]} {url}')
@@ -247,7 +248,6 @@
 where j.id IS NULL""")
 if __name__ == '__main__':
     logging.basicConfig(level='INFO', format='%(levelname)-7s %(message)s')
-    print(os.environ)
     cn = connect()
     logging.info('downloading buildkite data')
     insert_new_builds(cn)
diff --git a/scripts/metrics/load_buildkite.sh b/scripts/metrics/load_buildkite.sh
deleted file mode 100755
index 8621cf7..0000000
--- a/scripts/metrics/load_buildkite.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/env bash
-echo "loading buildkite data"
-SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
-cd $SCRIPT_DIR
-pipenv install
-cloud_sql_proxy -instances=llvm-premerge-checks:us-central1:buildkite-stats=tcp:0.0.0.0:5432 & pipenv run python3 $SCRIPT_DIR/load_buildkite.py
diff --git a/scripts/metrics/requirements.txt b/scripts/metrics/requirements.txt
deleted file mode 100644
index 96d29ac..0000000
--- a/scripts/metrics/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-# these files are needed in addition to /scripts/requirments.txt
-psycopg2
\ No newline at end of file
diff --git a/scripts/metrics/server_monitoring.py b/scripts/metrics/server_monitoring.py
index 064f2ad..9eba3fe 100755
--- a/scripts/metrics/server_monitoring.py
+++ b/scripts/metrics/server_monitoring.py
@@ -1,14 +1,15 @@
 #!/usr/bin/env python3
-
+import traceback
 import psycopg2
 from phabricator import Phabricator
 import os
 from typing import Optional
 import datetime
 import requests
+import logging
 
 PHABRICATOR_URL = "https://reviews.llvm.org/api/"
-BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2/"
+BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2"
 
 
 def phab_up() -> Optional[Phabricator]:
@@ -16,36 +17,37 @@
 
     Returns None if server is down.
     """
-    print("Checking Phabricator status...")
+    logging.info("Checking Phabricator status...")
     try:
-        phab = Phabricator(host=PHABRICATOR_URL)
+        phab = Phabricator(token=os.getenv('CONDUIT_TOKEN'), host=PHABRICATOR_URL)
         phab.update_interfaces()
-        print(" Phabricator is up.")
+        logging.info("Phabricator is up.")
         return phab
-    except Exception:
-        pass
-    print(" Phabricator is down.")
+    except Exception as ex:
+        logging.error(ex)
+        logging.error(traceback.format_exc())
+        logging.warning("Phabricator is down.")
     return None
 
 
 def buildbot_up() -> bool:
     """Check if buildbot server is up"""
-    print("Checking Buildbot status...")
+    logging.info("Checking Buildbot status...")
     try:
-        response = requests.get(BUILDBOT_URL + "buildrequests?limit=100")
-        if "masters" in response.json():
-            print(" Buildbot is up.")
-            return True
-    except Exception:
-        pass
-    print(" Buildbot is down.")
+        response = requests.get(BUILDBOT_URL)
+        logging.info(f'{response.status_code} {BUILDBOT_URL}')
+        logging.info(response.content)
+        return response.status_code == 200
+    except Exception as ex:
+        logging.error(ex)
+        logging.error(traceback.format_exc())
+        logging.warning("Buildbot is down.")
     return False
 
 
 def log_server_status(phab: bool, buildbot: bool, conn: psycopg2.extensions.connection):
     """log the phabricator status to the database."""
-    print("Writing Phabricator status to database...")
-
+    logging.info("Writing Phabricator status to database...")
     cur = conn.cursor()
     cur.execute(
         "INSERT INTO server_status (timestamp, phabricator, buildbot) VALUES (%s,%s,%s);",
@@ -57,10 +59,7 @@
 def connect_to_db() -> psycopg2.extensions.connection:
     """Connect to the database, create tables as needed."""
     conn = psycopg2.connect(
-        "host=127.0.0.1 sslmode=disable dbname=stats user={} password={}".format(
-            os.environ["PGUSER"], os.environ["PGPASSWORD"]
-        )
-    )
+        f"host=127.0.0.1 sslmode=disable dbname=phabricator user=stats password={os.getenv('DB_PASSWORD')}")
     cur = conn.cursor()
     cur.execute(
         "CREATE TABLE IF NOT EXISTS server_status (timestamp timestamp, phabricator boolean, buildbot boolean);"
@@ -69,14 +68,9 @@ def connect_to_db() -> psycopg2.extensions.connection:
     return conn
 
 
-def server_monitoring():
-    """Main function of monitoring the servers."""
+if __name__ == "__main__":
+    logging.basicConfig(level='INFO', format='%(levelname)-7s %(message)s')
     conn = connect_to_db()
     phab = phab_up()
     buildbot = buildbot_up()
     log_server_status(phab is not None, buildbot, conn)
-    print("Completed, exiting...")
-
-
-if __name__ == "__main__":
-    server_monitoring()