From e703a856cb31b7ed4f29f5f94c78d39cc3090e50 Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Fri, 14 May 2021 14:54:57 +0200 Subject: [PATCH] Add graceful termination period for linux agents Now builds will not be dropped on cluster upgrades. That requires container updates. --- .../buildkite-premerge-debian/Dockerfile | 6 +- .../{start_agent.sh => entrypoint.sh} | 21 ++--- docs/playbooks.md | 2 +- kubernetes/buildkite/linux-agents-test.yaml | 80 +++++++++++++++++++ kubernetes/buildkite/linux-agents.yaml | 7 +- kubernetes/buildkite/service-agents-test.yaml | 73 +++++++++++++++++ kubernetes/buildkite/service-agents.yaml | 5 +- 7 files changed, 173 insertions(+), 21 deletions(-) rename containers/buildkite-premerge-debian/{start_agent.sh => entrypoint.sh} (68%) create mode 100644 kubernetes/buildkite/linux-agents-test.yaml create mode 100644 kubernetes/buildkite/service-agents-test.yaml diff --git a/containers/buildkite-premerge-debian/Dockerfile b/containers/buildkite-premerge-debian/Dockerfile index 5423886..f22134d 100644 --- a/containers/buildkite-premerge-debian/Dockerfile +++ b/containers/buildkite-premerge-debian/Dockerfile @@ -5,9 +5,11 @@ RUN echo 'install buildkite' ;\ sh -c 'echo deb https://apt.buildkite.com/buildkite-agent stable main > /etc/apt/sources.list.d/buildkite-agent.list' ;\ apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 32A37959C2FA5C3C99EFBC32A79206696452D198 ;\ apt-get update ;\ - apt-get install -y buildkite-agent; \ + apt-get install -y buildkite-agent tini; \ apt-get clean; COPY *.sh /usr/local/bin/ RUN chmod og+rx /usr/local/bin/*.sh COPY --chown=buildkite-agent:buildkite-agent pre-checkout /etc/buildkite-agent/hooks -CMD ["start_agent.sh"] \ No newline at end of file + +ENTRYPOINT ["entrypoint.sh"] +CMD ["buildkite-agent", "start", "--no-color"] \ No newline at end of file diff --git a/containers/buildkite-premerge-debian/start_agent.sh b/containers/buildkite-premerge-debian/entrypoint.sh similarity index 68% 
rename from containers/buildkite-premerge-debian/start_agent.sh rename to containers/buildkite-premerge-debian/entrypoint.sh index 698cc1f..9be6285 100755 --- a/containers/buildkite-premerge-debian/start_agent.sh +++ b/containers/buildkite-premerge-debian/entrypoint.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash -# Copyright 2020 Google LLC + +# Copyright 2021 Google LLC # # Licensed under the the Apache License v2.0 with LLVM Exceptions (the "License"); # you may not use this file except in compliance with the License. @@ -12,11 +13,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +set -euo pipefail -# Buildkite installation creates 'buildkite-agent' user. USER=buildkite-agent - -# prepare work directory mkdir -p "${BUILDKITE_BUILD_PATH}" chown -R ${USER}:${USER} "${BUILDKITE_BUILD_PATH}" @@ -26,12 +25,8 @@ mkdir -p "${CCACHE_DIR}" chown -R ${USER}:${USER} "${CCACHE_DIR}" # /mnt/ssh should contain known_hosts, id_rsa and id_rsa.pub . -mkdir -p /var/lib/buildkite-agent/.ssh -cp /mnt/ssh/* /var/lib/buildkite-agent/.ssh -chmod 700 /var/lib/buildkite-agent/.ssh -chmod 600 /var/lib/buildkite-agent/.ssh/* -chown -R $USER:$USER /var/lib/buildkite-agent/.ssh - -su buildkite-agent -c "buildkite-agent start" -echo "agent exited" -sleep 10m \ No newline at end of file +mkdir -p ~/.ssh +chmod 700 ~/.ssh +cp /mnt/ssh/* ~/.ssh +chmod 600 ~/.ssh/* +exec /usr/bin/tini -g -- "$@" # quoted so every CMD argument is passed to tini unsplit \ No newline at end of file diff --git a/docs/playbooks.md b/docs/playbooks.md index e3e8926..65c7f2f 100644 --- a/docs/playbooks.md +++ b/docs/playbooks.md @@ -190,7 +190,7 @@ Most commonly used are: - `ph_projects`: which projects to use, "detect" will look on diff to infer the projects, "default" selects all projects. - `ph_notify_email`: comma-separated list of email addresses to be notified when build is complete. 
- `ph_log_level` ("DEBUG", "INFO", "WARNING" (default) or "ERROR"): log level for build scripts. -- `ph_linux_agents`, `ph_windows_agents`: custom JSON constraints on agents. For example, you might put one machine to a custom queue if it's errornous and send jobs to it with `ph_windows_agents="{{\"queue\": \"custom\"}}"`. +- `ph_linux_agents`, `ph_windows_agents`: custom JSON constraints on agents. For example, you might put one machine into a custom queue if it's erroneous and send jobs to it with `ph_windows_agents={"queue": "custom"}`. - `ph_skip_linux`, `ph_skip_windows` (if set to any value): skip build on this OS. - `ph_skip_generated`: don't run custom steps generated from within llvm-project. diff --git a/kubernetes/buildkite/linux-agents-test.yaml b/kubernetes/buildkite/linux-agents-test.yaml new file mode 100644 index 0000000..b382d00 --- /dev/null +++ b/kubernetes/buildkite/linux-agents-test.yaml @@ -0,0 +1,80 @@ +# Copyright 2021 Google LLC +# +# Licensed under the the Apache License v2.0 with LLVM Exceptions (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://llvm.org/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: linux-agents-test + namespace: buildkite +spec: + replicas: 1 + selector: + matchLabels: + app: linux-agents-test # unique to this Deployment so its selector does not overlap other agent Deployments + template: + metadata: + labels: + app: linux-agents-test # must stay equal to spec.selector.matchLabels.app + spec: + containers: + - name: buildkite-premerge-debian + image: gcr.io/llvm-premerge-checks/buildkite-premerge-debian:latest + resources: + limits: + cpu: 15 + memory: 50Gi + requests: + cpu: 15 + memory: 50Gi + volumeMounts: + - name: ssd + mountPath: /mnt/disks/ssd0 + - name: github-ssh + mountPath: /mnt/ssh + env: + - name: BUILDKITE_AGENT_TOKEN + valueFrom: + secretKeyRef: + name: buildkite-agent-token + key: token + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: BUILDKITE_AGENT_TAGS + value: "queue=linux-test,name=$(POD_NAME)" + - name: BUILDKITE_BUILD_PATH + value: "/mnt/disks/ssd0/agent" + - name: CONDUIT_TOKEN + valueFrom: + secretKeyRef: + name: conduit-api-token + key: token + - name: BUILDKITE_API_TOKEN + valueFrom: + secretKeyRef: + name: buildkite-api-token-readonly + key: token + volumes: + - name: ssd + hostPath: + # directory location on host + path: /mnt/disks/ssd0 + type: Directory + - name: github-ssh + secret: + secretName: github-ssh + nodeSelector: + cloud.google.com/gke-nodepool: linux-agents + terminationGracePeriodSeconds: 3600 \ No newline at end of file diff --git a/kubernetes/buildkite/linux-agents.yaml b/kubernetes/buildkite/linux-agents.yaml index 51d2499..9c42af3 100644 --- a/kubernetes/buildkite/linux-agents.yaml +++ b/kubernetes/buildkite/linux-agents.yaml @@ -22,7 +22,7 @@ spec: strategy: rollingUpdate: maxSurge: 25% - maxUnavailable: 25% + maxUnavailable: 50% type: RollingUpdate selector: matchLabels: @@ -34,7 +34,7 @@ spec: spec: containers: - name: buildkite-premerge-debian - image: gcr.io/llvm-premerge-checks/buildkite-premerge-debian + image: gcr.io/llvm-premerge-checks/buildkite-premerge-debian:stable resources: limits: cpu: 30 @@ -81,4 +81,5 @@ spec: 
secret: secretName: github-ssh nodeSelector: - cloud.google.com/gke-nodepool: linux-agents \ No newline at end of file + cloud.google.com/gke-nodepool: linux-agents + terminationGracePeriodSeconds: 3600 \ No newline at end of file diff --git a/kubernetes/buildkite/service-agents-test.yaml b/kubernetes/buildkite/service-agents-test.yaml new file mode 100644 index 0000000..acaebd8 --- /dev/null +++ b/kubernetes/buildkite/service-agents-test.yaml @@ -0,0 +1,73 @@ +# Copyright 2021 Google LLC +# +# Licensed under the the Apache License v2.0 with LLVM Exceptions (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://llvm.org/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: service-agents-test + namespace: buildkite +spec: + replicas: 1 + selector: + matchLabels: + app: service-agents-test # unique to this Deployment so its selector does not overlap other agent Deployments + template: + metadata: + labels: + app: service-agents-test # must stay equal to spec.selector.matchLabels.app + spec: + containers: + - name: buildkite-premerge-debian + image: gcr.io/llvm-premerge-checks/buildkite-premerge-debian:latest + resources: + limits: + cpu: 2 + memory: 5Gi + requests: + cpu: 1.5 + memory: 5Gi + volumeMounts: + - name: github-ssh + mountPath: /mnt/ssh + env: + - name: BUILDKITE_AGENT_TOKEN + valueFrom: + secretKeyRef: + name: buildkite-agent-token + key: token + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: BUILDKITE_AGENT_TAGS + value: "queue=service-test,name=$(POD_NAME)" + - name: BUILDKITE_BUILD_PATH + value: "/var/lib/buildkite-agent/builds" + - name: CONDUIT_TOKEN + valueFrom: + secretKeyRef: + name: conduit-api-token + key: token + - name: BUILDKITE_API_TOKEN + valueFrom: + secretKeyRef: + name: buildkite-api-token-readonly + key: token + volumes: + - name: github-ssh + secret: + secretName: github-ssh + nodeSelector: + cloud.google.com/gke-nodepool: default-pool + terminationGracePeriodSeconds: 1200 \ No newline at end of file diff --git a/kubernetes/buildkite/service-agents.yaml b/kubernetes/buildkite/service-agents.yaml index 3ec1f26..7a94b17 100644 --- a/kubernetes/buildkite/service-agents.yaml +++ b/kubernetes/buildkite/service-agents.yaml @@ -34,7 +34,7 @@ spec: spec: containers: - name: buildkite-premerge-debian - image: gcr.io/llvm-premerge-checks/buildkite-premerge-debian + image: gcr.io/llvm-premerge-checks/buildkite-premerge-debian:stable resources: limits: cpu: 2 @@ -72,4 +72,5 @@ spec: secret: secretName: github-ssh nodeSelector: - cloud.google.com/gke-nodepool: default-pool \ No newline at end of file + cloud.google.com/gke-nodepool: default-pool + terminationGracePeriodSeconds: 1200 \ No newline at end of file