1
0
Fork 0

cron jobs for buildbot and phab monitoring

+ fix phabricator / buildbot uptime monitoring

+ data is stored in separate databases
This commit is contained in:
Mikhail Goncharov 2021-05-20 18:11:55 +02:00
parent 46b7b8d8b7
commit a44473098c
12 changed files with 239 additions and 82 deletions

View file

@ -19,8 +19,11 @@ ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8 ENV LC_ALL en_US.UTF-8
COPY *.sh /usr/local/bin/ COPY *.sh /usr/local/bin/
# requirements.txt generated by running `pipenv lock -r > ../../containers/stats/requirements.txt` in `scripts/metrics`.
COPY requirements.txt /tmp/
RUN pip install -q -r /tmp/requirements.txt
RUN chmod og+rx /usr/local/bin/*.sh RUN chmod og+rx /usr/local/bin/*.sh
RUN wget https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 -O /usr/local/bin/cloud_sql_proxy;\ RUN wget -q https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 -O /usr/local/bin/cloud_sql_proxy;\
chmod +x /usr/local/bin/cloud_sql_proxy chmod +x /usr/local/bin/cloud_sql_proxy
ENTRYPOINT ["entrypoint.sh"] ENTRYPOINT ["entrypoint.sh"]

View file

@ -17,4 +17,6 @@ set -euo pipefail
git clone --depth 1 https://github.com/google/llvm-premerge-checks.git ~/llvm-premerge-checks git clone --depth 1 https://github.com/google/llvm-premerge-checks.git ~/llvm-premerge-checks
cd ~/llvm-premerge-checks cd ~/llvm-premerge-checks
git fetch origin "${SCRIPTS_REFSPEC:=main}":x
git checkout x
exec /usr/bin/tini -g -- $@ exec /usr/bin/tini -g -- $@

View file

@ -0,0 +1,27 @@
backoff==1.10.0
certifi==2020.12.5
chardet==4.0.0
ftfy==6.0.1; python_version >= '3.6'
gitdb==4.0.7
gitpython==3.1.17
idna==2.10
lxml==4.6.3
mailchecker==4.0.7
pathspec==0.8.1
phabricator==0.8.1
phonenumbers==8.12.23
psycopg2-binary==2.8.6
pyaml==20.4.0
python-benedict==0.24.0
python-dateutil==2.8.1
python-fsutil==0.5.0
python-slugify==5.0.2
pyyaml==5.4.1
requests==2.25.1
six==1.16.0
smmap==4.0.0
text-unidecode==1.3
toml==0.10.2
urllib3==1.26.4
wcwidth==0.2.5
xmltodict==0.12.0

View file

@ -0,0 +1,63 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: batch/v1beta1
kind: CronJob
metadata:
name: buildbot-stats
namespace: buildkite
spec:
schedule: "40 * * * *"
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 24
failedJobsHistoryLimit: 3
jobTemplate:
spec:
template:
spec:
containers:
- name: collect-buildkite-stats
image: gcr.io/llvm-premerge-checks/stats:latest
args: ["/root/llvm-premerge-checks/scripts/metrics/connect_db.sh python3 buildbot_monitoring.py"]
env:
- name: BUILDKITE_AGENT_TOKEN
valueFrom:
secretKeyRef:
name: buildkite-agent-token
key: token
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: CONDUIT_TOKEN
valueFrom:
secretKeyRef:
name: conduit-api-token
key: token
- name: BUILDKITE_API_TOKEN
valueFrom:
secretKeyRef:
name: buildkite-api-token
key: token
- name: DB_PASSWORD
valueFrom:
secretKeyRef:
name: db-stats
key: password
- name: SCRIPTS_REFSPEC
value: "cron-2"
restartPolicy: Never
nodeSelector:
cloud.google.com/gke-nodepool: service

View file

@ -16,7 +16,7 @@
apiVersion: batch/v1beta1 apiVersion: batch/v1beta1
kind: CronJob kind: CronJob
metadata: metadata:
name: collect-stats name: buildkite-stats
namespace: buildkite namespace: buildkite
spec: spec:
schedule: "0 * * * *" schedule: "0 * * * *"
@ -30,7 +30,7 @@ spec:
containers: containers:
- name: collect-buildkite-stats - name: collect-buildkite-stats
image: gcr.io/llvm-premerge-checks/stats:latest image: gcr.io/llvm-premerge-checks/stats:latest
args: ["/root/llvm-premerge-checks/scripts/metrics/load_buildkite.sh"] args: ["/root/llvm-premerge-checks/scripts/metrics/connect_db.sh python3 load_buildkite.py"]
env: env:
- name: BUILDKITE_AGENT_TOKEN - name: BUILDKITE_AGENT_TOKEN
valueFrom: valueFrom:
@ -56,6 +56,8 @@ spec:
secretKeyRef: secretKeyRef:
name: db-stats name: db-stats
key: password key: password
- name: SCRIPTS_REFSPEC
value: "cron-2"
restartPolicy: Never restartPolicy: Never
nodeSelector: nodeSelector:
cloud.google.com/gke-nodepool: service cloud.google.com/gke-nodepool: service

View file

@ -0,0 +1,63 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: batch/v1beta1
kind: CronJob
metadata:
name: uptime-stats
namespace: buildkite
spec:
schedule: "20 * * * *"
concurrencyPolicy: Forbid
successfulJobsHistoryLimit: 24
failedJobsHistoryLimit: 3
jobTemplate:
spec:
template:
spec:
containers:
- name: collect-buildkite-stats
image: gcr.io/llvm-premerge-checks/stats:latest
args: ["/root/llvm-premerge-checks/scripts/metrics/connect_db.sh python3 server_monitoring.py"]
env:
- name: BUILDKITE_AGENT_TOKEN
valueFrom:
secretKeyRef:
name: buildkite-agent-token
key: token
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: CONDUIT_TOKEN
valueFrom:
secretKeyRef:
name: conduit-api-token
key: token
- name: BUILDKITE_API_TOKEN
valueFrom:
secretKeyRef:
name: buildkite-api-token
key: token
- name: DB_PASSWORD
valueFrom:
secretKeyRef:
name: db-stats
key: password
- name: SCRIPTS_REFSPEC
value: "cron-2"
restartPolicy: Never
nodeSelector:
cloud.google.com/gke-nodepool: service

View file

@ -1,5 +1,5 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import logging
import psycopg2 import psycopg2
import os import os
import datetime import datetime
@ -10,6 +10,7 @@ import json
PHABRICATOR_URL = "https://reviews.llvm.org/api/" PHABRICATOR_URL = "https://reviews.llvm.org/api/"
BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2/" BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2/"
# TODO(kuhnel): retry on connection issues, maybe reuse # TODO(kuhnel): retry on connection issues, maybe reuse
# https://github.com/google/llvm-premerge-checks/blob/main/scripts/phabtalk/phabtalk.py#L44 # https://github.com/google/llvm-premerge-checks/blob/main/scripts/phabtalk/phabtalk.py#L44
@ -18,12 +19,9 @@ BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2/"
def connect_to_db() -> psycopg2.extensions.connection: def connect_to_db() -> psycopg2.extensions.connection:
"""Connect to the database, create tables as needed.""" """Connect to the database."""
conn = psycopg2.connect( conn = psycopg2.connect(
"host=127.0.0.1 sslmode=disable dbname=stats user={} password={}".format( f"host=127.0.0.1 sslmode=disable dbname=buildbots user=stats password={os.getenv('DB_PASSWORD')}")
os.environ["PGUSER"], os.environ["PGPASSWORD"]
)
)
return conn return conn
@ -81,7 +79,7 @@ def create_tables(conn: psycopg2.extensions.connection):
def get_worker_status( def get_worker_status(
worker_id: int, conn: psycopg2.extensions.connection worker_id: int, conn: psycopg2.extensions.connection
) -> Optional[Dict]: ) -> Optional[Dict]:
"""Note: postgres returns a dict for a stored json object.""" """Note: postgres returns a dict for a stored json object."""
cur = conn.cursor() cur = conn.cursor()
@ -96,7 +94,7 @@ def get_worker_status(
def get_builder_status( def get_builder_status(
builder_id: int, conn: psycopg2.extensions.connection builder_id: int, conn: psycopg2.extensions.connection
) -> Optional[Dict]: ) -> Optional[Dict]:
"""Note: postgres returns a dict for a stored json object.""" """Note: postgres returns a dict for a stored json object."""
cur = conn.cursor() cur = conn.cursor()
@ -112,10 +110,10 @@ def get_builder_status(
def set_worker_status( def set_worker_status(
timestamp: datetime.datetime, timestamp: datetime.datetime,
worker_id: int, worker_id: int,
data: str, data: str,
conn: psycopg2.extensions.connection, conn: psycopg2.extensions.connection,
): ):
cur = conn.cursor() cur = conn.cursor()
cur.execute( cur.execute(
@ -126,7 +124,7 @@ def set_worker_status(
def update_workers(conn: psycopg2.extensions.connection): def update_workers(conn: psycopg2.extensions.connection):
print("Updating worker status...") logging.info("Updating worker status...")
response = requests.get(BUILDBOT_URL + "workers") response = requests.get(BUILDBOT_URL + "workers")
timestamp = datetime.datetime.now() timestamp = datetime.datetime.now()
for worker in response.json()["workers"]: for worker in response.json()["workers"]:
@ -144,7 +142,7 @@ def update_workers(conn: psycopg2.extensions.connection):
def update_builders(conn: psycopg2.extensions.connection): def update_builders(conn: psycopg2.extensions.connection):
"""get list of all builder ids.""" """get list of all builder ids."""
print("Updating builder status...") logging.info("Updating builder status...")
response = requests.get(BUILDBOT_URL + "builders") response = requests.get(BUILDBOT_URL + "builders")
timestamp = datetime.datetime.now() timestamp = datetime.datetime.now()
for builder in response.json()["builders"]: for builder in response.json()["builders"]:
@ -174,13 +172,10 @@ def get_last_build(conn: psycopg2.extensions.connection) -> int:
def update_build_status(conn: psycopg2.extensions.connection): def update_build_status(conn: psycopg2.extensions.connection):
start_id = get_last_build(conn) start_id = get_last_build(conn)
print("Updating build results, starting with {}...".format(start_id)) logging.info("Updating build results, starting with {}...".format(start_id))
url = BUILDBOT_URL + "builds" url = BUILDBOT_URL + "builds"
cur = conn.cursor() cur = conn.cursor()
for result_set in rest_request_iterator(url, "builds", "buildid", start_id=start_id):
for result_set in rest_request_iterator(
url, "builds", "buildid", start_id=start_id
):
args_str = b",".join( args_str = b",".join(
cur.mogrify( cur.mogrify(
b" (%s,%s,%s,%s) ", b" (%s,%s,%s,%s) ",
@ -194,21 +189,20 @@ def update_build_status(conn: psycopg2.extensions.connection):
for build in result_set for build in result_set
if build["complete"] if build["complete"]
) )
cur.execute( cur.execute(
b"INSERT INTO buildbot_builds (build_id, builder_id, build_number, build_data) values " b"INSERT INTO buildbot_builds (build_id, builder_id, build_number, build_data) values "
+ args_str + args_str
) )
print(" {}".format(result_set[-1]["buildid"])) logging.info("last build id: {}".format(result_set[-1]["buildid"]))
conn.commit() conn.commit()
def rest_request_iterator( def rest_request_iterator(
url: str, url: str,
array_field_name: str, array_field_name: str,
id_field_name: str, id_field_name: str,
start_id: int = 0, start_id: int = 0,
step: int = 1000, step: int = 1000,
): ):
"""Request paginated data from the buildbot master. """Request paginated data from the buildbot master.
@ -252,12 +246,12 @@ def get_latest_buildset(conn: psycopg2.extensions.connection) -> int:
def update_buildsets(conn: psycopg2.extensions.connection): def update_buildsets(conn: psycopg2.extensions.connection):
start_id = get_latest_buildset(conn) start_id = get_latest_buildset(conn)
print("Getting buildsets, starting with {}...".format(start_id)) logging.info("Getting buildsets, starting with {}...".format(start_id))
url = BUILDBOT_URL + "buildsets" url = BUILDBOT_URL + "buildsets"
cur = conn.cursor() cur = conn.cursor()
for result_set in rest_request_iterator( for result_set in rest_request_iterator(
url, "buildsets", "bsid", start_id=start_id url, "buildsets", "bsid", start_id=start_id
): ):
args_str = b",".join( args_str = b",".join(
cur.mogrify( cur.mogrify(
@ -273,7 +267,7 @@ def update_buildsets(conn: psycopg2.extensions.connection):
cur.execute( cur.execute(
b"INSERT INTO buildbot_buildsets (buildset_id, data) values " + args_str b"INSERT INTO buildbot_buildsets (buildset_id, data) values " + args_str
) )
print(" {}".format(result_set[-1]["bsid"])) logging.info("last id {}".format(result_set[-1]["bsid"]))
conn.commit() conn.commit()
@ -288,11 +282,11 @@ def get_latest_buildrequest(conn: psycopg2.extensions.connection) -> int:
def update_buildrequests(conn: psycopg2.extensions.connection): def update_buildrequests(conn: psycopg2.extensions.connection):
start_id = get_latest_buildrequest(conn) start_id = get_latest_buildrequest(conn)
print("Getting buildrequests, starting with {}...".format(start_id)) logging.info("Getting buildrequests, starting with {}...".format(start_id))
url = BUILDBOT_URL + "buildrequests" url = BUILDBOT_URL + "buildrequests"
cur = conn.cursor() cur = conn.cursor()
for result_set in rest_request_iterator( for result_set in rest_request_iterator(
url, "buildrequests", "buildrequestid", start_id=start_id url, "buildrequests", "buildrequestid", start_id=start_id
): ):
# cur.mogrify returns a byte string, so we need to join on a byte string # cur.mogrify returns a byte string, so we need to join on a byte string
args_str = b",".join( args_str = b",".join(
@ -313,12 +307,12 @@ def update_buildrequests(conn: psycopg2.extensions.connection):
b"INSERT INTO buildbot_buildrequests (buildrequest_id, buildset_id, data) values " b"INSERT INTO buildbot_buildrequests (buildrequest_id, buildset_id, data) values "
+ args_str + args_str
) )
print(" {}".format(result_set[-1]["buildrequestid"])) logging.info("{}".format(result_set[-1]["buildrequestid"]))
conn.commit() conn.commit()
def buildbot_monitoring(): if __name__ == "__main__":
"""Main function of monitoring the phabricator server.""" logging.basicConfig(level='INFO', format='%(levelname)-7s %(message)s')
conn = connect_to_db() conn = connect_to_db()
create_tables(conn) create_tables(conn)
update_workers(conn) update_workers(conn)
@ -326,8 +320,3 @@ def buildbot_monitoring():
update_build_status(conn) update_build_status(conn)
update_buildsets(conn) update_buildsets(conn)
update_buildrequests(conn) update_buildrequests(conn)
print("Completed, exiting...")
if __name__ == "__main__":
buildbot_monitoring()

22
scripts/metrics/connect_db.sh Executable file
View file

@ -0,0 +1,22 @@
#!/usr/bin/env bash
# Copyright 2021 Google LLC
#
# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# generate statistics on the llvm github repository
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
cd $SCRIPT_DIR
# sleep to let the proxy come up
cloud_sql_proxy -instances=llvm-premerge-checks:us-central1:buildkite-stats=tcp:0.0.0.0:5432 &
sleep 3s
$@

View file

@ -1,5 +1,4 @@
import sys import sys
import psycopg2 import psycopg2
import psycopg2.extras import psycopg2.extras
import logging import logging
@ -14,6 +13,7 @@ psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
token = f'Bearer {os.getenv("BUILDKITE_API_TOKEN")}' token = f'Bearer {os.getenv("BUILDKITE_API_TOKEN")}'
def connect(): def connect():
return psycopg2.connect( return psycopg2.connect(
f"host=127.0.0.1 sslmode=disable dbname=stats user=stats password={os.getenv('DB_PASSWORD')}") f"host=127.0.0.1 sslmode=disable dbname=stats user=stats password={os.getenv('DB_PASSWORD')}")
@ -57,7 +57,8 @@ where a.id IS NULL and j.raw->>'raw_log_url' IS NOT NULL
content, en = download_text(url) content, en = download_text(url)
meta['encoding'] = en meta['encoding'] = en
with conn.cursor() as i: with conn.cursor() as i:
i.execute('INSERT INTO artifacts (id, job_id, content, meta) VALUES (%s, %s, %s, %s)', [job_id, job_id, content, meta]) i.execute('INSERT INTO artifacts (id, job_id, content, meta) VALUES (%s, %s, %s, %s)',
[job_id, job_id, content, meta])
except: except:
meta['failure'] = traceback.format_exc() meta['failure'] = traceback.format_exc()
logging.error(f'download artifact failed {meta["failure"]} {url}') logging.error(f'download artifact failed {meta["failure"]} {url}')
@ -247,7 +248,6 @@ where j.id IS NULL""")
if __name__ == '__main__': if __name__ == '__main__':
logging.basicConfig(level='INFO', format='%(levelname)-7s %(message)s') logging.basicConfig(level='INFO', format='%(levelname)-7s %(message)s')
print(os.environ)
cn = connect() cn = connect()
logging.info('downloading buildkite data') logging.info('downloading buildkite data')
insert_new_builds(cn) insert_new_builds(cn)

View file

@ -1,6 +0,0 @@
#!/usr/bin/env bash
echo "loading buildkite data"
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
cd $SCRIPT_DIR
pipenv install
cloud_sql_proxy -instances=llvm-premerge-checks:us-central1:buildkite-stats=tcp:0.0.0.0:5432 & pipenv run python3 $SCRIPT_DIR/load_buildkite.py

View file

@ -1,2 +0,0 @@
# these files are needed in addition to /scripts/requirments.txt
psycopg2

View file

@ -1,14 +1,15 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import traceback
import psycopg2 import psycopg2
from phabricator import Phabricator from phabricator import Phabricator
import os import os
from typing import Optional from typing import Optional
import datetime import datetime
import requests import requests
import logging
PHABRICATOR_URL = "https://reviews.llvm.org/api/" PHABRICATOR_URL = "https://reviews.llvm.org/api/"
BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2/" BUILDBOT_URL = "https://lab.llvm.org/buildbot/api/v2"
def phab_up() -> Optional[Phabricator]: def phab_up() -> Optional[Phabricator]:
@ -16,36 +17,37 @@ def phab_up() -> Optional[Phabricator]:
Returns None if server is down. Returns None if server is down.
""" """
print("Checking Phabricator status...") logging.info("Checking Phabricator status...")
try: try:
phab = Phabricator(host=PHABRICATOR_URL) phab = Phabricator(token=os.getenv('CONDUIT_TOKEN'), host=PHABRICATOR_URL)
phab.update_interfaces() phab.update_interfaces()
print(" Phabricator is up.") logging.info("Phabricator is up.")
return phab return phab
except Exception: except Exception as ex:
pass logging.error(ex)
print(" Phabricator is down.") logging.error(traceback.format_exc())
logging.warning("Phabricator is down.")
return None return None
def buildbot_up() -> bool: def buildbot_up() -> bool:
"""Check if buildbot server is up""" """Check if buildbot server is up"""
print("Checking Buildbot status...") logging.info("Checking Buildbot status...")
try: try:
response = requests.get(BUILDBOT_URL + "buildrequests?limit=100") response = requests.get(BUILDBOT_URL)
if "masters" in response.json(): logging.info(f'{response.status_code} {BUILDBOT_URL}')
print(" Buildbot is up.") logging.info(response.content)
return True return response.status_code == 200
except Exception: except Exception as ex:
pass logging.error(ex)
print(" Buildbot is down.") logging.error(traceback.format_exc())
logging.warning("Buildbot is down.")
return False return False
def log_server_status(phab: bool, buildbot: bool, conn: psycopg2.extensions.connection): def log_server_status(phab: bool, buildbot: bool, conn: psycopg2.extensions.connection):
"""log the phabricator status to the database.""" """log the phabricator status to the database."""
print("Writing Phabricator status to database...") logging.info("Writing Phabricator status to database...")
cur = conn.cursor() cur = conn.cursor()
cur.execute( cur.execute(
"INSERT INTO server_status (timestamp, phabricator, buildbot) VALUES (%s,%s,%s);", "INSERT INTO server_status (timestamp, phabricator, buildbot) VALUES (%s,%s,%s);",
@ -57,10 +59,7 @@ def log_server_status(phab: bool, buildbot: bool, conn: psycopg2.extensions.conn
def connect_to_db() -> psycopg2.extensions.connection: def connect_to_db() -> psycopg2.extensions.connection:
"""Connect to the database, create tables as needed.""" """Connect to the database, create tables as needed."""
conn = psycopg2.connect( conn = psycopg2.connect(
"host=127.0.0.1 sslmode=disable dbname=stats user={} password={}".format( f"host=127.0.0.1 sslmode=disable dbname=phabricator user=stats password={os.getenv('DB_PASSWORD')}")
os.environ["PGUSER"], os.environ["PGPASSWORD"]
)
)
cur = conn.cursor() cur = conn.cursor()
cur.execute( cur.execute(
"CREATE TABLE IF NOT EXISTS server_status (timestamp timestamp, phabricator boolean, buildbot boolean);" "CREATE TABLE IF NOT EXISTS server_status (timestamp timestamp, phabricator boolean, buildbot boolean);"
@ -69,14 +68,9 @@ def connect_to_db() -> psycopg2.extensions.connection:
return conn return conn
def server_monitoring(): if __name__ == "__main__":
"""Main function of monitoring the servers.""" logging.basicConfig(level='INFO', format='%(levelname)-7s %(message)s')
conn = connect_to_db() conn = connect_to_db()
phab = phab_up() phab = phab_up()
buildbot = buildbot_up() buildbot = buildbot_up()
log_server_status(phab is not None, buildbot, conn) log_server_status(phab is not None, buildbot, conn)
print("Completed, exiting...")
if __name__ == "__main__":
server_monitoring()