1
0
Fork 0

cron jobs for buildbot and phab monitoring

+ fix phabricator / buildbot uptime monitoring

+ data is stored in separate databases
This commit is contained in:
Mikhail Goncharov 2021-05-20 18:11:55 +02:00
parent 46b7b8d8b7
commit a44473098c
12 changed files with 239 additions and 82 deletions

View file

@ -19,8 +19,11 @@ ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
COPY *.sh /usr/local/bin/
# requirements.txt generated by running `pipenv lock -r > ../../containers/stats/requirements.txt` in `scripts/metrics`.
COPY requirements.txt /tmp/
RUN pip install -q -r /tmp/requirements.txt
RUN chmod og+rx /usr/local/bin/*.sh
RUN wget https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 -O /usr/local/bin/cloud_sql_proxy;\
RUN wget -q https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 -O /usr/local/bin/cloud_sql_proxy;\
chmod +x /usr/local/bin/cloud_sql_proxy
ENTRYPOINT ["entrypoint.sh"]

View file

@ -17,4 +17,6 @@ set -euo pipefail
git clone --depth 1 https://github.com/google/llvm-premerge-checks.git ~/llvm-premerge-checks
cd ~/llvm-premerge-checks
git fetch origin "${SCRIPTS_REFSPEC:=main}":x
git checkout x
exec /usr/bin/tini -g -- $@

View file

@ -0,0 +1,27 @@
backoff==1.10.0
certifi==2020.12.5
chardet==4.0.0
ftfy==6.0.1; python_version >= '3.6'
gitdb==4.0.7
gitpython==3.1.17
idna==2.10
lxml==4.6.3
mailchecker==4.0.7
pathspec==0.8.1
phabricator==0.8.1
phonenumbers==8.12.23
psycopg2-binary==2.8.6
pyaml==20.4.0
python-benedict==0.24.0
python-dateutil==2.8.1
python-fsutil==0.5.0
python-slugify==5.0.2
pyyaml==5.4.1
requests==2.25.1
six==1.16.0
smmap==4.0.0
text-unidecode==1.3
toml==0.10.2
urllib3==1.26.4
wcwidth==0.2.5
xmltodict==0.12.0

View file

@ -0,0 +1,63 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# CronJob "buildbot-stats": once an hour, at minute 40, starts the stats image
# and runs buildbot_monitoring.py through scripts/metrics/connect_db.sh.
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: buildbot-stats
  namespace: buildkite
spec:
  schedule: "40 * * * *"
  # Do not start a new run while the previous one is still active.
  concurrencyPolicy: Forbid
  # Keep about one day of successful hourly runs and the last 3 failures.
  successfulJobsHistoryLimit: 24
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            # NOTE(review): the container name looks copied from the buildkite
            # stats job; it is left unchanged here so anything referring to it
            # by name keeps working.
            - name: collect-buildkite-stats
              image: gcr.io/llvm-premerge-checks/stats:latest
              # The whole command is passed as ONE string. The image entrypoint
              # expands its arguments unquoted, so the string is split on
              # whitespace into the script path and its arguments.
              args: ["/root/llvm-premerge-checks/scripts/metrics/connect_db.sh python3 buildbot_monitoring.py"]
              env:
                # NOTE(review): the set of secrets is identical across the
                # stats cron jobs; confirm which of them this job really needs.
                - name: BUILDKITE_AGENT_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: buildkite-agent-token
                      key: token
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
                - name: CONDUIT_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: conduit-api-token
                      key: token
                - name: BUILDKITE_API_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: buildkite-api-token
                      key: token
                # Password of the database user used by the metrics scripts.
                - name: DB_PASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: db-stats
                      key: password
                # Git ref of llvm-premerge-checks that the image entrypoint
                # fetches and checks out before running the args above.
                # NOTE(review): "cron-2" looks like a development branch;
                # confirm it is the intended ref.
                - name: SCRIPTS_REFSPEC
                  value: "cron-2"
          restartPolicy: Never
          nodeSelector:
            cloud.google.com/gke-nodepool: service

View file

@ -16,7 +16,7 @@
apiVersion: batch/v1beta1
kind: CronJob
metadata:
name: collect-stats
name: buildkite-stats
namespace: buildkite
spec:
schedule: "0 * * * *"
@ -30,7 +30,7 @@ spec:
containers:
- name: collect-buildkite-stats
image: gcr.io/llvm-premerge-checks/stats:latest
args: ["/root/llvm-premerge-checks/scripts/metrics/load_buildkite.sh"]
args: ["/root/llvm-premerge-checks/scripts/metrics/connect_db.sh python3 load_buildkite.py"]
env:
- name: BUILDKITE_AGENT_TOKEN
valueFrom:
@ -56,6 +56,8 @@ spec:
secretKeyRef:
name: db-stats
key: password
- name: SCRIPTS_REFSPEC
value: "cron-2"
restartPolicy: Never
nodeSelector:
cloud.google.com/gke-nodepool: service

View file

@ -0,0 +1,63 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# CronJob "uptime-stats": once an hour, at minute 20, starts the stats image
# and runs server_monitoring.py through scripts/metrics/connect_db.sh.
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: uptime-stats
  namespace: buildkite
spec:
  schedule: "20 * * * *"
  # Do not start a new run while the previous one is still active.
  concurrencyPolicy: Forbid
  # Keep about one day of successful hourly runs and the last 3 failures.
  successfulJobsHistoryLimit: 24
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            # NOTE(review): the container name looks copied from the buildkite
            # stats job; it is left unchanged here so anything referring to it
            # by name keeps working.
            - name: collect-buildkite-stats
              image: gcr.io/llvm-premerge-checks/stats:latest
              # The whole command is passed as ONE string. The image entrypoint
              # expands its arguments unquoted, so the string is split on
              # whitespace into the script path and its arguments.
              args: ["/root/llvm-premerge-checks/scripts/metrics/connect_db.sh python3 server_monitoring.py"]
              env:
                # NOTE(review): the set of secrets is identical across the
                # stats cron jobs; confirm which of them this job really needs.
                - name: BUILDKITE_AGENT_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: buildkite-agent-token
                      key: token
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.name
                # API token handed to the Phabricator client by the uptime check.
                - name: CONDUIT_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: conduit-api-token
                      key: token
                - name: BUILDKITE_API_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: buildkite-api-token
                      key: token
                # Password of the database user used by the metrics scripts.
                - name: DB_PASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: db-stats
                      key: password
                # Git ref of llvm-premerge-checks that the image entrypoint
                # fetches and checks out before running the args above.
                # NOTE(review): "cron-2" looks like a development branch;
                # confirm it is the intended ref.
                - name: SCRIPTS_REFSPEC
                  value: "cron-2"
          restartPolicy: Never
          nodeSelector:
            cloud.google.com/gke-nodepool: service

View file

@ -1,5 +1,5 @@
#!/usr/bin/env python3
import logging
import psycopg2
import os
import datetime
@ -10,6 +10,7 @@ import json
PHABRICATOR_URL = "https://reviews.llvm.org/api/"
BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2/"
# TODO(kuhnel): retry on connection issues, maybe reuse
# https://github.com/google/llvm-premerge-checks/blob/main/scripts/phabtalk/phabtalk.py#L44
@ -18,12 +19,9 @@ BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2/"
def connect_to_db() -> psycopg2.extensions.connection:
"""Connect to the database, create tables as needed."""
"""Connect to the database."""
conn = psycopg2.connect(
"host=127.0.0.1 sslmode=disable dbname=stats user={} password={}".format(
os.environ["PGUSER"], os.environ["PGPASSWORD"]
)
)
f"host=127.0.0.1 sslmode=disable dbname=buildbots user=stats password={os.getenv('DB_PASSWORD')}")
return conn
@ -126,7 +124,7 @@ def set_worker_status(
def update_workers(conn: psycopg2.extensions.connection):
print("Updating worker status...")
logging.info("Updating worker status...")
response = requests.get(BUILDBOT_URL + "workers")
timestamp = datetime.datetime.now()
for worker in response.json()["workers"]:
@ -144,7 +142,7 @@ def update_workers(conn: psycopg2.extensions.connection):
def update_builders(conn: psycopg2.extensions.connection):
"""get list of all builder ids."""
print("Updating builder status...")
logging.info("Updating builder status...")
response = requests.get(BUILDBOT_URL + "builders")
timestamp = datetime.datetime.now()
for builder in response.json()["builders"]:
@ -174,13 +172,10 @@ def get_last_build(conn: psycopg2.extensions.connection) -> int:
def update_build_status(conn: psycopg2.extensions.connection):
start_id = get_last_build(conn)
print("Updating build results, starting with {}...".format(start_id))
logging.info("Updating build results, starting with {}...".format(start_id))
url = BUILDBOT_URL + "builds"
cur = conn.cursor()
for result_set in rest_request_iterator(
url, "builds", "buildid", start_id=start_id
):
for result_set in rest_request_iterator(url, "builds", "buildid", start_id=start_id):
args_str = b",".join(
cur.mogrify(
b" (%s,%s,%s,%s) ",
@ -194,12 +189,11 @@ def update_build_status(conn: psycopg2.extensions.connection):
for build in result_set
if build["complete"]
)
cur.execute(
b"INSERT INTO buildbot_builds (build_id, builder_id, build_number, build_data) values "
+ args_str
)
print(" {}".format(result_set[-1]["buildid"]))
logging.info("last build id: {}".format(result_set[-1]["buildid"]))
conn.commit()
@ -252,7 +246,7 @@ def get_latest_buildset(conn: psycopg2.extensions.connection) -> int:
def update_buildsets(conn: psycopg2.extensions.connection):
start_id = get_latest_buildset(conn)
print("Getting buildsets, starting with {}...".format(start_id))
logging.info("Getting buildsets, starting with {}...".format(start_id))
url = BUILDBOT_URL + "buildsets"
cur = conn.cursor()
@ -273,7 +267,7 @@ def update_buildsets(conn: psycopg2.extensions.connection):
cur.execute(
b"INSERT INTO buildbot_buildsets (buildset_id, data) values " + args_str
)
print(" {}".format(result_set[-1]["bsid"]))
logging.info("last id {}".format(result_set[-1]["bsid"]))
conn.commit()
@ -288,7 +282,7 @@ def get_latest_buildrequest(conn: psycopg2.extensions.connection) -> int:
def update_buildrequests(conn: psycopg2.extensions.connection):
start_id = get_latest_buildrequest(conn)
print("Getting buildrequests, starting with {}...".format(start_id))
logging.info("Getting buildrequests, starting with {}...".format(start_id))
url = BUILDBOT_URL + "buildrequests"
cur = conn.cursor()
for result_set in rest_request_iterator(
@ -313,12 +307,12 @@ def update_buildrequests(conn: psycopg2.extensions.connection):
b"INSERT INTO buildbot_buildrequests (buildrequest_id, buildset_id, data) values "
+ args_str
)
print(" {}".format(result_set[-1]["buildrequestid"]))
logging.info("{}".format(result_set[-1]["buildrequestid"]))
conn.commit()
def buildbot_monitoring():
"""Main function of monitoring the phabricator server."""
if __name__ == "__main__":
logging.basicConfig(level='INFO', format='%(levelname)-7s %(message)s')
conn = connect_to_db()
create_tables(conn)
update_workers(conn)
@ -326,8 +320,3 @@ def buildbot_monitoring():
update_build_status(conn)
update_buildsets(conn)
update_buildrequests(conn)
print("Completed, exiting...")
if __name__ == "__main__":
buildbot_monitoring()

22
scripts/metrics/connect_db.sh Executable file
View file

@ -0,0 +1,22 @@
#!/usr/bin/env bash
# Copyright 2021 Google LLC
#
# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Start the Cloud SQL proxy in the background, then run the command passed as
# arguments (e.g. `python3 load_buildkite.py`) from this script's directory.

# Absolute path of the directory that contains this script.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# Quoted so a path with spaces still works; stop if the directory cannot be
# entered, because the wrapped command is given relative to it.
cd "$SCRIPT_DIR" || exit 1
# Expose the Cloud SQL instance on local TCP port 5432; runs in the background.
cloud_sql_proxy -instances=llvm-premerge-checks:us-central1:buildkite-stats=tcp:0.0.0.0:5432 &
# Give the proxy a moment to come up before the command tries to connect.
sleep 3s
# Run the wrapped command. Quoting keeps each argument intact (no re-splitting
# or glob expansion); the script's exit status is that of the command.
"$@"

View file

@ -1,5 +1,4 @@
import sys
import psycopg2
import psycopg2.extras
import logging
@ -14,6 +13,7 @@ psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
token = f'Bearer {os.getenv("BUILDKITE_API_TOKEN")}'
def connect():
return psycopg2.connect(
f"host=127.0.0.1 sslmode=disable dbname=stats user=stats password={os.getenv('DB_PASSWORD')}")
@ -57,7 +57,8 @@ where a.id IS NULL and j.raw->>'raw_log_url' IS NOT NULL
content, en = download_text(url)
meta['encoding'] = en
with conn.cursor() as i:
i.execute('INSERT INTO artifacts (id, job_id, content, meta) VALUES (%s, %s, %s, %s)', [job_id, job_id, content, meta])
i.execute('INSERT INTO artifacts (id, job_id, content, meta) VALUES (%s, %s, %s, %s)',
[job_id, job_id, content, meta])
except:
meta['failure'] = traceback.format_exc()
logging.error(f'download artifact failed {meta["failure"]} {url}')
@ -247,7 +248,6 @@ where j.id IS NULL""")
if __name__ == '__main__':
logging.basicConfig(level='INFO', format='%(levelname)-7s %(message)s')
print(os.environ)
cn = connect()
logging.info('downloading buildkite data')
insert_new_builds(cn)

View file

@ -1,6 +0,0 @@
#!/usr/bin/env bash
# Set up the pipenv environment, start the Cloud SQL proxy in the background
# and run load_buildkite.py.
echo "loading buildkite data"
# Absolute path of the directory that contains this script.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
cd $SCRIPT_DIR
# Prepare the pipenv environment of this directory before running the loader.
pipenv install
# The proxy is backgrounded with `&`; the loader starts immediately in the
# foreground without waiting for the proxy to become ready.
cloud_sql_proxy -instances=llvm-premerge-checks:us-central1:buildkite-stats=tcp:0.0.0.0:5432 & pipenv run python3 $SCRIPT_DIR/load_buildkite.py

View file

@ -1,2 +0,0 @@
# these files are needed in addition to /scripts/requirements.txt
psycopg2

View file

@ -1,14 +1,15 @@
#!/usr/bin/env python3
import traceback
import psycopg2
from phabricator import Phabricator
import os
from typing import Optional
import datetime
import requests
import logging
PHABRICATOR_URL = "https://reviews.llvm.org/api/"
BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2/"
BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2"
def phab_up() -> Optional[Phabricator]:
@ -16,36 +17,37 @@ def phab_up() -> Optional[Phabricator]:
Returns None if server is down.
"""
print("Checking Phabricator status...")
logging.info("Checking Phabricator status...")
try:
phab = Phabricator(host=PHABRICATOR_URL)
phab = Phabricator(token=os.getenv('CONDUIT_TOKEN'), host=PHABRICATOR_URL)
phab.update_interfaces()
print(" Phabricator is up.")
logging.info("Phabricator is up.")
return phab
except Exception:
pass
print(" Phabricator is down.")
except Exception as ex:
logging.error(ex)
logging.error(traceback.format_exc())
logging.warning("Phabricator is down.")
return None
def buildbot_up() -> bool:
"""Check if buildbot server is up"""
print("Checking Buildbot status...")
logging.info("Checking Buildbot status...")
try:
response = requests.get(BUILDBOT_URL + "buildrequests?limit=100")
if "masters" in response.json():
print(" Buildbot is up.")
return True
except Exception:
pass
print(" Buildbot is down.")
response = requests.get(BUILDBOT_URL)
logging.info(f'{response.status_code} {BUILDBOT_URL}')
logging.info(response.content)
return response.status_code == 200
except Exception as ex:
logging.error(ex)
logging.error(traceback.format_exc())
logging.warning("Buildbot is down.")
return False
def log_server_status(phab: bool, buildbot: bool, conn: psycopg2.extensions.connection):
"""log the phabricator status to the database."""
print("Writing Phabricator status to database...")
logging.info("Writing Phabricator status to database...")
cur = conn.cursor()
cur.execute(
"INSERT INTO server_status (timestamp, phabricator, buildbot) VALUES (%s,%s,%s);",
@ -57,10 +59,7 @@ def log_server_status(phab: bool, buildbot: bool, conn: psycopg2.extensions.conn
def connect_to_db() -> psycopg2.extensions.connection:
"""Connect to the database, create tables as needed."""
conn = psycopg2.connect(
"host=127.0.0.1 sslmode=disable dbname=stats user={} password={}".format(
os.environ["PGUSER"], os.environ["PGPASSWORD"]
)
)
f"host=127.0.0.1 sslmode=disable dbname=phabricator user=stats password={os.getenv('DB_PASSWORD')}")
cur = conn.cursor()
cur.execute(
"CREATE TABLE IF NOT EXISTS server_status (timestamp timestamp, phabricator boolean, buildbot boolean);"
@ -69,14 +68,9 @@ def connect_to_db() -> psycopg2.extensions.connection:
return conn
def server_monitoring():
"""Main function of monitoring the servers."""
if __name__ == "__main__":
logging.basicConfig(level='INFO', format='%(levelname)-7s %(message)s')
conn = connect_to_db()
phab = phab_up()
buildbot = buildbot_up()
log_server_status(phab is not None, buildbot, conn)
print("Completed, exiting...")
if __name__ == "__main__":
server_monitoring()