cron jobs for buildbot and phab monitoring
+ fix phabricator / buildbot uptime monitoring + data is stored in separate databases
This commit is contained in:
parent
46b7b8d8b7
commit
a44473098c
12 changed files with 239 additions and 82 deletions
|
@ -19,8 +19,11 @@ ENV LANGUAGE en_US:en
|
|||
ENV LC_ALL en_US.UTF-8
|
||||
|
||||
COPY *.sh /usr/local/bin/
|
||||
# requirements.txt generated by running `pipenv lock -r > ../../containers/stats/requirements.txt` in `scripts/metrics`.
|
||||
COPY requirements.txt /tmp/
|
||||
RUN pip install -q -r /tmp/requirements.txt
|
||||
RUN chmod og+rx /usr/local/bin/*.sh
|
||||
RUN wget https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 -O /usr/local/bin/cloud_sql_proxy;\
|
||||
RUN wget -q https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 -O /usr/local/bin/cloud_sql_proxy;\
|
||||
chmod +x /usr/local/bin/cloud_sql_proxy
|
||||
|
||||
ENTRYPOINT ["entrypoint.sh"]
|
||||
|
|
|
@ -17,4 +17,6 @@ set -euo pipefail
|
|||
|
||||
git clone --depth 1 https://github.com/google/llvm-premerge-checks.git ~/llvm-premerge-checks
|
||||
cd ~/llvm-premerge-checks
|
||||
git fetch origin "${SCRIPTS_REFSPEC:=main}":x
|
||||
git checkout x
|
||||
exec /usr/bin/tini -g -- $@
|
||||
|
|
27
containers/stats/requirements.txt
Normal file
27
containers/stats/requirements.txt
Normal file
|
@ -0,0 +1,27 @@
|
|||
backoff==1.10.0
|
||||
certifi==2020.12.5
|
||||
chardet==4.0.0
|
||||
ftfy==6.0.1; python_version >= '3.6'
|
||||
gitdb==4.0.7
|
||||
gitpython==3.1.17
|
||||
idna==2.10
|
||||
lxml==4.6.3
|
||||
mailchecker==4.0.7
|
||||
pathspec==0.8.1
|
||||
phabricator==0.8.1
|
||||
phonenumbers==8.12.23
|
||||
psycopg2-binary==2.8.6
|
||||
pyaml==20.4.0
|
||||
python-benedict==0.24.0
|
||||
python-dateutil==2.8.1
|
||||
python-fsutil==0.5.0
|
||||
python-slugify==5.0.2
|
||||
pyyaml==5.4.1
|
||||
requests==2.25.1
|
||||
six==1.16.0
|
||||
smmap==4.0.0
|
||||
text-unidecode==1.3
|
||||
toml==0.10.2
|
||||
urllib3==1.26.4
|
||||
wcwidth==0.2.5
|
||||
xmltodict==0.12.0
|
63
kubernetes/cron/buildbots.yaml
Normal file
63
kubernetes/cron/buildbots.yaml
Normal file
|
@ -0,0 +1,63 @@
|
|||
# Copyright 2021 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://llvm.org/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
apiVersion: batch/v1beta1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: buildbot-stats
|
||||
namespace: buildkite
|
||||
spec:
|
||||
schedule: "40 * * * *"
|
||||
concurrencyPolicy: Forbid
|
||||
successfulJobsHistoryLimit: 24
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: collect-buildkite-stats
|
||||
image: gcr.io/llvm-premerge-checks/stats:latest
|
||||
args: ["/root/llvm-premerge-checks/scripts/metrics/connect_db.sh python3 buildbot_monitoring.py"]
|
||||
env:
|
||||
- name: BUILDKITE_AGENT_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: buildkite-agent-token
|
||||
key: token
|
||||
- name: POD_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.name
|
||||
- name: CONDUIT_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: conduit-api-token
|
||||
key: token
|
||||
- name: BUILDKITE_API_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: buildkite-api-token
|
||||
key: token
|
||||
- name: DB_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: db-stats
|
||||
key: password
|
||||
- name: SCRIPTS_REFSPEC
|
||||
value: "cron-2"
|
||||
restartPolicy: Never
|
||||
nodeSelector:
|
||||
cloud.google.com/gke-nodepool: service
|
|
@ -16,7 +16,7 @@
|
|||
apiVersion: batch/v1beta1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: collect-stats
|
||||
name: buildkite-stats
|
||||
namespace: buildkite
|
||||
spec:
|
||||
schedule: "0 * * * *"
|
||||
|
@ -30,7 +30,7 @@ spec:
|
|||
containers:
|
||||
- name: collect-buildkite-stats
|
||||
image: gcr.io/llvm-premerge-checks/stats:latest
|
||||
args: ["/root/llvm-premerge-checks/scripts/metrics/load_buildkite.sh"]
|
||||
args: ["/root/llvm-premerge-checks/scripts/metrics/connect_db.sh python3 load_buildkite.py"]
|
||||
env:
|
||||
- name: BUILDKITE_AGENT_TOKEN
|
||||
valueFrom:
|
||||
|
@ -56,6 +56,8 @@ spec:
|
|||
secretKeyRef:
|
||||
name: db-stats
|
||||
key: password
|
||||
- name: SCRIPTS_REFSPEC
|
||||
value: "cron-2"
|
||||
restartPolicy: Never
|
||||
nodeSelector:
|
||||
cloud.google.com/gke-nodepool: service
|
63
kubernetes/cron/uptime.yaml
Normal file
63
kubernetes/cron/uptime.yaml
Normal file
|
@ -0,0 +1,63 @@
|
|||
# Copyright 2021 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://llvm.org/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
apiVersion: batch/v1beta1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: uptime-stats
|
||||
namespace: buildkite
|
||||
spec:
|
||||
schedule: "20 * * * *"
|
||||
concurrencyPolicy: Forbid
|
||||
successfulJobsHistoryLimit: 24
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
containers:
|
||||
- name: collect-buildkite-stats
|
||||
image: gcr.io/llvm-premerge-checks/stats:latest
|
||||
args: ["/root/llvm-premerge-checks/scripts/metrics/connect_db.sh python3 server_monitoring.py"]
|
||||
env:
|
||||
- name: BUILDKITE_AGENT_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: buildkite-agent-token
|
||||
key: token
|
||||
- name: POD_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.name
|
||||
- name: CONDUIT_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: conduit-api-token
|
||||
key: token
|
||||
- name: BUILDKITE_API_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: buildkite-api-token
|
||||
key: token
|
||||
- name: DB_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: db-stats
|
||||
key: password
|
||||
- name: SCRIPTS_REFSPEC
|
||||
value: "cron-2"
|
||||
restartPolicy: Never
|
||||
nodeSelector:
|
||||
cloud.google.com/gke-nodepool: service
|
|
@ -1,5 +1,5 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import logging
|
||||
import psycopg2
|
||||
import os
|
||||
import datetime
|
||||
|
@ -10,6 +10,7 @@ import json
|
|||
PHABRICATOR_URL = "https://reviews.llvm.org/api/"
|
||||
BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2/"
|
||||
|
||||
|
||||
# TODO(kuhnel): retry on connection issues, maybe reuse
|
||||
# https://github.com/google/llvm-premerge-checks/blob/main/scripts/phabtalk/phabtalk.py#L44
|
||||
|
||||
|
@ -18,12 +19,9 @@ BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2/"
|
|||
|
||||
|
||||
def connect_to_db() -> psycopg2.extensions.connection:
|
||||
"""Connect to the database, create tables as needed."""
|
||||
"""Connect to the database."""
|
||||
conn = psycopg2.connect(
|
||||
"host=127.0.0.1 sslmode=disable dbname=stats user={} password={}".format(
|
||||
os.environ["PGUSER"], os.environ["PGPASSWORD"]
|
||||
)
|
||||
)
|
||||
f"host=127.0.0.1 sslmode=disable dbname=buildbots user=stats password={os.getenv('DB_PASSWORD')}")
|
||||
return conn
|
||||
|
||||
|
||||
|
@ -126,7 +124,7 @@ def set_worker_status(
|
|||
|
||||
|
||||
def update_workers(conn: psycopg2.extensions.connection):
|
||||
print("Updating worker status...")
|
||||
logging.info("Updating worker status...")
|
||||
response = requests.get(BUILDBOT_URL + "workers")
|
||||
timestamp = datetime.datetime.now()
|
||||
for worker in response.json()["workers"]:
|
||||
|
@ -144,7 +142,7 @@ def update_workers(conn: psycopg2.extensions.connection):
|
|||
|
||||
def update_builders(conn: psycopg2.extensions.connection):
|
||||
"""get list of all builder ids."""
|
||||
print("Updating builder status...")
|
||||
logging.info("Updating builder status...")
|
||||
response = requests.get(BUILDBOT_URL + "builders")
|
||||
timestamp = datetime.datetime.now()
|
||||
for builder in response.json()["builders"]:
|
||||
|
@ -174,13 +172,10 @@ def get_last_build(conn: psycopg2.extensions.connection) -> int:
|
|||
|
||||
def update_build_status(conn: psycopg2.extensions.connection):
|
||||
start_id = get_last_build(conn)
|
||||
print("Updating build results, starting with {}...".format(start_id))
|
||||
logging.info("Updating build results, starting with {}...".format(start_id))
|
||||
url = BUILDBOT_URL + "builds"
|
||||
cur = conn.cursor()
|
||||
|
||||
for result_set in rest_request_iterator(
|
||||
url, "builds", "buildid", start_id=start_id
|
||||
):
|
||||
for result_set in rest_request_iterator(url, "builds", "buildid", start_id=start_id):
|
||||
args_str = b",".join(
|
||||
cur.mogrify(
|
||||
b" (%s,%s,%s,%s) ",
|
||||
|
@ -194,12 +189,11 @@ def update_build_status(conn: psycopg2.extensions.connection):
|
|||
for build in result_set
|
||||
if build["complete"]
|
||||
)
|
||||
|
||||
cur.execute(
|
||||
b"INSERT INTO buildbot_builds (build_id, builder_id, build_number, build_data) values "
|
||||
+ args_str
|
||||
)
|
||||
print(" {}".format(result_set[-1]["buildid"]))
|
||||
logging.info("last build id: {}".format(result_set[-1]["buildid"]))
|
||||
conn.commit()
|
||||
|
||||
|
||||
|
@ -252,7 +246,7 @@ def get_latest_buildset(conn: psycopg2.extensions.connection) -> int:
|
|||
|
||||
def update_buildsets(conn: psycopg2.extensions.connection):
|
||||
start_id = get_latest_buildset(conn)
|
||||
print("Getting buildsets, starting with {}...".format(start_id))
|
||||
logging.info("Getting buildsets, starting with {}...".format(start_id))
|
||||
url = BUILDBOT_URL + "buildsets"
|
||||
cur = conn.cursor()
|
||||
|
||||
|
@ -273,7 +267,7 @@ def update_buildsets(conn: psycopg2.extensions.connection):
|
|||
cur.execute(
|
||||
b"INSERT INTO buildbot_buildsets (buildset_id, data) values " + args_str
|
||||
)
|
||||
print(" {}".format(result_set[-1]["bsid"]))
|
||||
logging.info("last id {}".format(result_set[-1]["bsid"]))
|
||||
conn.commit()
|
||||
|
||||
|
||||
|
@ -288,7 +282,7 @@ def get_latest_buildrequest(conn: psycopg2.extensions.connection) -> int:
|
|||
|
||||
def update_buildrequests(conn: psycopg2.extensions.connection):
|
||||
start_id = get_latest_buildrequest(conn)
|
||||
print("Getting buildrequests, starting with {}...".format(start_id))
|
||||
logging.info("Getting buildrequests, starting with {}...".format(start_id))
|
||||
url = BUILDBOT_URL + "buildrequests"
|
||||
cur = conn.cursor()
|
||||
for result_set in rest_request_iterator(
|
||||
|
@ -313,12 +307,12 @@ def update_buildrequests(conn: psycopg2.extensions.connection):
|
|||
b"INSERT INTO buildbot_buildrequests (buildrequest_id, buildset_id, data) values "
|
||||
+ args_str
|
||||
)
|
||||
print(" {}".format(result_set[-1]["buildrequestid"]))
|
||||
logging.info("{}".format(result_set[-1]["buildrequestid"]))
|
||||
conn.commit()
|
||||
|
||||
|
||||
def buildbot_monitoring():
|
||||
"""Main function of monitoring the phabricator server."""
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level='INFO', format='%(levelname)-7s %(message)s')
|
||||
conn = connect_to_db()
|
||||
create_tables(conn)
|
||||
update_workers(conn)
|
||||
|
@ -326,8 +320,3 @@ def buildbot_monitoring():
|
|||
update_build_status(conn)
|
||||
update_buildsets(conn)
|
||||
update_buildrequests(conn)
|
||||
print("Completed, exiting...")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
buildbot_monitoring()
|
||||
|
|
22
scripts/metrics/connect_db.sh
Executable file
22
scripts/metrics/connect_db.sh
Executable file
|
@ -0,0 +1,22 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright 2021 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://llvm.org/LICENSE.txt
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# start the Cloud SQL proxy in the background, then run the command passed as arguments
|
||||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
cd $SCRIPT_DIR
|
||||
# start the proxy in the background, then sleep to let it come up
|
||||
cloud_sql_proxy -instances=llvm-premerge-checks:us-central1:buildkite-stats=tcp:0.0.0.0:5432 &
|
||||
sleep 3s
|
||||
$@
|
|
@ -1,5 +1,4 @@
|
|||
import sys
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import logging
|
||||
|
@ -14,6 +13,7 @@ psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
|
|||
|
||||
token = f'Bearer {os.getenv("BUILDKITE_API_TOKEN")}'
|
||||
|
||||
|
||||
def connect():
|
||||
return psycopg2.connect(
|
||||
f"host=127.0.0.1 sslmode=disable dbname=stats user=stats password={os.getenv('DB_PASSWORD')}")
|
||||
|
@ -57,7 +57,8 @@ where a.id IS NULL and j.raw->>'raw_log_url' IS NOT NULL
|
|||
content, en = download_text(url)
|
||||
meta['encoding'] = en
|
||||
with conn.cursor() as i:
|
||||
i.execute('INSERT INTO artifacts (id, job_id, content, meta) VALUES (%s, %s, %s, %s)', [job_id, job_id, content, meta])
|
||||
i.execute('INSERT INTO artifacts (id, job_id, content, meta) VALUES (%s, %s, %s, %s)',
|
||||
[job_id, job_id, content, meta])
|
||||
except:
|
||||
meta['failure'] = traceback.format_exc()
|
||||
logging.error(f'download artifact failed {meta["failure"]} {url}')
|
||||
|
@ -247,7 +248,6 @@ where j.id IS NULL""")
|
|||
|
||||
if __name__ == '__main__':
|
||||
logging.basicConfig(level='INFO', format='%(levelname)-7s %(message)s')
|
||||
print(os.environ)
|
||||
cn = connect()
|
||||
logging.info('downloading buildkite data')
|
||||
insert_new_builds(cn)
|
||||
|
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
echo "loading buildkite data"
|
||||
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
cd $SCRIPT_DIR
|
||||
pipenv install
|
||||
cloud_sql_proxy -instances=llvm-premerge-checks:us-central1:buildkite-stats=tcp:0.0.0.0:5432 & pipenv run python3 $SCRIPT_DIR/load_buildkite.py
|
|
@ -1,2 +0,0 @@
|
|||
# these packages are needed in addition to /scripts/requirements.txt
|
||||
psycopg2
|
|
@ -1,14 +1,15 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import traceback
|
||||
import psycopg2
|
||||
from phabricator import Phabricator
|
||||
import os
|
||||
from typing import Optional
|
||||
import datetime
|
||||
import requests
|
||||
import logging
|
||||
|
||||
PHABRICATOR_URL = "https://reviews.llvm.org/api/"
|
||||
BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2/"
|
||||
BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2"
|
||||
|
||||
|
||||
def phab_up() -> Optional[Phabricator]:
|
||||
|
@ -16,36 +17,37 @@ def phab_up() -> Optional[Phabricator]:
|
|||
|
||||
Returns None if server is down.
|
||||
"""
|
||||
print("Checking Phabricator status...")
|
||||
logging.info("Checking Phabricator status...")
|
||||
try:
|
||||
phab = Phabricator(host=PHABRICATOR_URL)
|
||||
phab = Phabricator(token=os.getenv('CONDUIT_TOKEN'), host=PHABRICATOR_URL)
|
||||
phab.update_interfaces()
|
||||
print(" Phabricator is up.")
|
||||
logging.info("Phabricator is up.")
|
||||
return phab
|
||||
except Exception:
|
||||
pass
|
||||
print(" Phabricator is down.")
|
||||
except Exception as ex:
|
||||
logging.error(ex)
|
||||
logging.error(traceback.format_exc())
|
||||
logging.warning("Phabricator is down.")
|
||||
return None
|
||||
|
||||
|
||||
def buildbot_up() -> bool:
|
||||
"""Check if buildbot server is up"""
|
||||
print("Checking Buildbot status...")
|
||||
logging.info("Checking Buildbot status...")
|
||||
try:
|
||||
response = requests.get(BUILDBOT_URL + "buildrequests?limit=100")
|
||||
if "masters" in response.json():
|
||||
print(" Buildbot is up.")
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
print(" Buildbot is down.")
|
||||
response = requests.get(BUILDBOT_URL)
|
||||
logging.info(f'{response.status_code} {BUILDBOT_URL}')
|
||||
logging.info(response.content)
|
||||
return response.status_code == 200
|
||||
except Exception as ex:
|
||||
logging.error(ex)
|
||||
logging.error(traceback.format_exc())
|
||||
logging.warning("Buildbot is down.")
|
||||
return False
|
||||
|
||||
|
||||
def log_server_status(phab: bool, buildbot: bool, conn: psycopg2.extensions.connection):
|
||||
"""log the phabricator status to the database."""
|
||||
print("Writing Phabricator status to database...")
|
||||
|
||||
logging.info("Writing Phabricator status to database...")
|
||||
cur = conn.cursor()
|
||||
cur.execute(
|
||||
"INSERT INTO server_status (timestamp, phabricator, buildbot) VALUES (%s,%s,%s);",
|
||||
|
@ -57,10 +59,7 @@ def log_server_status(phab: bool, buildbot: bool, conn: psycopg2.extensions.conn
|
|||
def connect_to_db() -> psycopg2.extensions.connection:
|
||||
"""Connect to the database, create tables as needed."""
|
||||
conn = psycopg2.connect(
|
||||
"host=127.0.0.1 sslmode=disable dbname=stats user={} password={}".format(
|
||||
os.environ["PGUSER"], os.environ["PGPASSWORD"]
|
||||
)
|
||||
)
|
||||
f"host=127.0.0.1 sslmode=disable dbname=phabricator user=stats password={os.getenv('DB_PASSWORD')}")
|
||||
cur = conn.cursor()
|
||||
cur.execute(
|
||||
"CREATE TABLE IF NOT EXISTS server_status (timestamp timestamp, phabricator boolean, buildbot boolean);"
|
||||
|
@ -69,14 +68,9 @@ def connect_to_db() -> psycopg2.extensions.connection:
|
|||
return conn
|
||||
|
||||
|
||||
def server_monitoring():
|
||||
"""Main function of monitoring the servers."""
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level='INFO', format='%(levelname)-7s %(message)s')
|
||||
conn = connect_to_db()
|
||||
phab = phab_up()
|
||||
buildbot = buildbot_up()
|
||||
log_server_status(phab is not None, buildbot, conn)
|
||||
print("Completed, exiting...")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
server_monitoring()
|
||||
|
|
Loading…
Reference in a new issue