#!/usr/bin/env python3
# Copyright 2019 Google LLC
#
# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Get data on Revisions and builds from Phabricator
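#
# Usage (a sketch; assumes a Conduit API token is available to the
# `phabricator` library, e.g. via ~/.arcrc):
#   python3 pull_phab_build_stats.py <path to a local checkout of the git repo>
# Intermediate JSON dumps and the resulting CSV metrics are written to a
# tmp/ directory next to this script.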
import phabricator
import json
import os
import datetime
from typing import Dict, List, Optional
import csv
import time
import socket
import git
import argparse
# PHIDs of build plans used for pre-merge testing
# FIXME: how do you get these?
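# (Presumably they can be looked up on the Phabricator instance itself, e.g.
# via the Harbormaster "Build Plans" UI or the harbormaster.buildplan.search
# Conduit method.)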
_PRE_MERGE_PHIDs = ['PHID-HMCP-bfkbtacsszhg3feydpo6',  # beta testers
                    'PHID-HMCP-qbviyekvgirhhhvkzpsn',  # public pre-merge tests
                    'PHID-HMCP-p2oc4ocen3l2yzymvg2l',
                    ]

# query all data after this date
START_DATE = datetime.date(year=2019, month=10, day=1)


class PhabResponse:
    """Wrapper around a single Conduit API response object (a dict with id/phid)."""

    def __init__(self, revision_dict: Dict):
        self.revision_dict = revision_dict

    @property
    def id(self) -> str:
        return self.revision_dict['id']

    @property
    def phid(self) -> str:
        return self.revision_dict['phid']

    def __str__(self):
        return str(self.revision_dict)


class Revision(PhabResponse):
    """A Differential revision, as returned by differential.revision.search."""

    def __init__(self, revision_dict):
        super().__init__(revision_dict)
        self.buildables = []  # type: List['Buildable']
        self.diffs = []  # type: List['Diff']

    @property
    def status(self) -> str:
        return self.revision_dict['fields']['status']['value']

    @property
    def builds(self) -> List['Build']:
        builds = []
        for b in self.buildables:
            builds.extend(b.builds)
        return builds

    @property
    def created_date(self):
        return self.revision_dict['fields']['dateCreated']

    @property
    def was_premerge_tested(self) -> bool:
        return any(b.was_premerge_tested for b in self.builds)

    @property
    def repository_phid(self) -> str:
        return self.revision_dict['fields']['repositoryPHID']

    @property
    def diff_phid(self) -> str:
        return self.revision_dict['fields']['diffPHID']

    @property
    def all_diffs_have_refs(self) -> bool:
        return all(d.has_refs for d in self.diffs)

    @property
    def day(self) -> datetime.date:
        return datetime.date.fromtimestamp(self.created_date)

    @property
    def week(self) -> str:
        day = self.day
        return '{}-w{:02d}'.format(day.year, day.isocalendar()[1])


class Buildable(PhabResponse):
    """A Harbormaster buildable, as returned by harbormaster.querybuildables."""

    def __init__(self, revision_dict):
        super().__init__(revision_dict)
        self.builds = []  # type: List[Build]
        self.revision = None  # type: Optional[Revision]

    @property
    def diff_phid(self) -> str:
        return self.revision_dict['buildablePHID']

    @property
    def revison_phid(self) -> str:
        return self.revision_dict['containerPHID']


class Build(PhabResponse):
    """A Harbormaster build, as returned by harbormaster.build.search."""

    def __init__(self, revision_dict):
        super().__init__(revision_dict)
        self.buildable = None  # type: Optional[Buildable]

    @property
    def buildable_phid(self) -> str:
        return self.revision_dict['fields']['buildablePHID']

    @property
    def buildplan_phid(self) -> str:
        return self.revision_dict['fields']['buildPlanPHID']

    @property
    def was_premerge_tested(self) -> bool:
        result = self.buildplan_phid in _PRE_MERGE_PHIDs
        if not result:
            print(self.buildplan_phid)
        return result


class Diff(PhabResponse):
    """A Differential diff, as returned by differential.diff.search."""

    def __init__(self, revision_dict):
        super().__init__(revision_dict)
        self.revision = None  # type: Optional[Revision]

    @property
    def revison_phid(self) -> str:
        return self.revision_dict['fields']['revisionPHID']

    @property
    def _refs(self) -> List:
        return self.revision_dict['fields']['refs']

    @property
    def has_refs(self) -> bool:
        return len(self._refs) > 0

    @property
    def base_revision(self) -> Optional[str]:
        for ref in self._refs:
            if ref['type'] == 'base':
                return ref['identifier']
        return None

    @property
    def base_branch(self) -> Optional[str]:
        for ref in self._refs:
            if ref['type'] == 'branch':
                return ref['name']
        return None

    @property
    def dateCreated(self) -> datetime.datetime:
        return datetime.datetime.fromtimestamp(self.revision_dict['fields']['dateCreated'])


class PhabBuildPuller:
    """Pulls revision, buildable, build and diff data from Phabricator and
    writes CSV metrics on pre-merge testing coverage."""

    # files/folder for storing temporary results
    _TMP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'tmp'))
    _REVISION_FILE = os.path.join(_TMP_DIR, 'phab-revisions.json')
    _BUILDABLE_FILE = os.path.join(_TMP_DIR, 'phab-buildables.json')
    _BUILD_FILE = os.path.join(_TMP_DIR, 'phab-build.json')
    _DIFF_FILE = os.path.join(_TMP_DIR, 'phab-diffs.json')
    _PHAB_WEEKLY_METRICS_FILE = os.path.join(_TMP_DIR, 'phabricator_{}.csv')

    def __init__(self, repo_path: str):
        self.conduit_token = None
        self.host = None
        self.phab = self._create_phab()
        self._repo_path = repo_path  # type: str
        self.revisions = {}  # type: Dict[str, Revision]
        self.buildables = {}  # type: Dict[str, Buildable]
        self.builds = {}  # type: Dict[str, Build]
        self.diffs = {}  # type: Dict[str, Diff]

    def _create_phab(self) -> phabricator.Phabricator:
        phab = phabricator.Phabricator(token=self.conduit_token, host=self.host)
        phab.update_interfaces()
        return phab

    def _load_arcrc(self):
        """Load arc configuration from file if not set."""
        if self.conduit_token is not None or self.host is not None:
            return
        print('Loading configuration from ~/.arcrc file')
        with open(os.path.expanduser('~/.arcrc'), 'r') as arcrc_file:
            arcrc = json.load(arcrc_file)
        # use the first host configured in the file
        self.host = next(iter(arcrc['hosts']))
        self.conduit_token = arcrc['hosts'][self.host]['token']

    def run(self):
        if not os.path.exists(self._TMP_DIR):
            os.mkdir(self._TMP_DIR)
        # Each download step is skipped if its JSON dump already exists under
        # tmp/, so the raw Conduit responses are effectively cached on disk.
        if not os.path.isfile(self._REVISION_FILE):
            self.get_revisions()
        self.parse_revisions()
        if not os.path.isfile(self._BUILDABLE_FILE):
            self.get_buildables()
        self.parse_buildables()
        if not os.path.isfile(self._BUILD_FILE):
            self.get_builds()
        self.parse_builds()
        if not os.path.isfile(self._DIFF_FILE):
            self.get_diffs()
        self.parse_diffs()
        self.link_objects()
        self.compute_metrics('day', lambda r: r.day)
        self.compute_metrics('week', lambda r: r.week)
        self.count_base_revisions()
        self.match_base_revisions_with_repo(self._repo_path)

    def get_revisions(self):
        print('Downloading revisions starting...')
        from_date = int(START_DATE.strftime('%s'))
        data = []
        constraints = {
            'createdStart': from_date
        }
        # FIXME: lots of code duplication around pagination and error handling.
        # find a way to separate this into a function (one possible shape is
        # sketched in _paginated_search below).
        after = None
        while True:
            revisions = self.phab.differential.revision.search(
                constraints=constraints, after=after)
            data.extend(revisions.response['data'])
            print('{} revisions...'.format(len(data)))
            after = revisions.response['cursor']['after']
            if after is None:
                break
        print('Number of revisions:', len(data))
        with open(self._REVISION_FILE, 'w') as json_file:
            json.dump(data, json_file)
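
    # The following helper is only a sketch (it is not called anywhere yet) of
    # how the duplicated pagination/retry loops in get_revisions,
    # get_buildables, get_builds and get_diffs could be factored out, as the
    # FIXME above suggests. `search_method` is assumed to be a Conduit
    # *.search wrapper such as self.phab.differential.diff.search.
    def _paginated_search(self, search_method, constraints: Dict, what: str) -> List:
        data = []
        after = None
        while True:
            # retry on timeouts, giving up after 5 failed attempts
            fail_count = 0
            while True:
                try:
                    result = search_method(constraints=constraints, after=after)
                except socket.timeout:
                    fail_count += 1
                    if fail_count > 5:
                        raise
                    time.sleep(10)
                    continue
                break
            data.extend(result.response['data'])
            print('{} {}...'.format(len(data), what))
            # follow the cursor until there are no more pages
            after = result.response['cursor']['after']
            if after is None:
                break
        return data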

    def get_buildables(self):
        print('Downloading buildables...')
        data = []
        after = None
        while True:
            revisions = self.phab.harbormaster.querybuildables(
                containerPHIDs=[r.phid for r in self.revisions.values()], after=after)
            data.extend(revisions.response['data'])
            print('{} buildables...'.format(len(data)))
            after = revisions.response['cursor']['after']
            if after is None:
                break
        print('Number of buildables:', len(data))
        with open(self._BUILDABLE_FILE, 'w') as json_file:
            json.dump(data, json_file)

    def get_builds(self):
        print('Downloading builds...')
        data = []
        constraints = {
            'buildables': [r.phid for r in self.buildables.values()]
        }
        after = None
        while True:
            # retry on timeouts
            fail_count = 0
            while True:
                try:
                    revisions = self.phab.harbormaster.build.search(
                        constraints=constraints, after=after)
                except socket.timeout:
                    fail_count += 1
                    if fail_count > 5:
                        raise
                    time.sleep(10)
                    continue
                break
            data.extend(revisions.response['data'])
            print('{} builds...'.format(len(data)))
            after = revisions.response['cursor']['after']
            if after is None:
                break
        print('Number of builds:', len(data))
        with open(self._BUILD_FILE, 'w') as json_file:
            json.dump(data, json_file)

    def get_diffs(self):
        print('Downloading diffs...')
        data = []
        constraints = {
            'revisionPHIDs': [r.phid for r in self.revisions.values()]
        }
        after = None
        while True:
            # retry on timeouts
            fail_count = 0
            while True:
                try:
                    diffs = self.phab.differential.diff.search(
                        constraints=constraints, after=after)
                except socket.timeout:
                    fail_count += 1
                    if fail_count > 5:
                        raise
                    time.sleep(10)
                    continue
                break
            data.extend(diffs.response['data'])
            print('{} diffs...'.format(len(data)))
            after = diffs.response['cursor']['after']
            if after is None:
                break
        print('Number of diffs:', len(data))
        with open(self._DIFF_FILE, 'w') as json_file:
            json.dump(data, json_file)

    def parse_revisions(self):
        with open(self._REVISION_FILE) as revision_file:
            revision_dict = json.load(revision_file)
        self.revisions = {r.phid: r for r in (Revision(x) for x in revision_dict)}
        print('Parsed {} revisions.'.format(len(self.revisions)))

    def parse_buildables(self):
        with open(self._BUILDABLE_FILE) as buildables_file:
            buildable_dict = json.load(buildables_file)
        self.buildables = {b.phid: b for b in (Buildable(x) for x in buildable_dict)}
        print('Parsed {} buildables.'.format(len(self.buildables)))

    def parse_builds(self):
        with open(self._BUILD_FILE) as build_file:
            build_dict = json.load(build_file)
        self.builds = {b.phid: b for b in (Build(x) for x in build_dict)}
        print('Parsed {} builds.'.format(len(self.builds)))

    def parse_diffs(self):
        with open(self._DIFF_FILE) as diff_file:
            diff_dict = json.load(diff_file)
        self.diffs = {d.phid: d for d in (Diff(x) for x in diff_dict)}
        print('Parsed {} diffs.'.format(len(self.diffs)))

    def link_objects(self):
        for build in self.builds.values():
            buildable = self.buildables[build.buildable_phid]
            build.buildable = buildable
            buildable.builds.append(build)
        for buildable in self.buildables.values():
            revision = self.revisions[buildable.revison_phid]
            revision.buildables.append(buildable)
            buildable.revision = revision
        for diff in self.diffs.values():
            revision = self.revisions[diff.revison_phid]
            revision.diffs.append(diff)
            diff.revision = revision

    def compute_metrics(self, name: str, group_function):
        print('Creating metrics for {}...'.format(name))
        group_dict = {}
        for revision in self.revisions.values():
            group_dict.setdefault(group_function(revision), []).append(revision)
        fieldnames = [name, '# revisions', '# tested revisions', '% tested revisions', '# untested revisions',
                      '# revisions without builds', '% revisions without builds', '# no repository set']
        with open(self._PHAB_WEEKLY_METRICS_FILE.format(name), 'w') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=csv.excel)
            writer.writeheader()
            for group in sorted(group_dict.keys()):
                revisions = group_dict[group]  # type: List[Revision]
                num_revisions = len(revisions)
                num_premt_revisions = len([r for r in revisions if r.was_premerge_tested])
                percentage_premt_revisions = 100.0 * num_premt_revisions / num_revisions
                num_no_build_triggered = len([r for r in revisions if len(r.builds) == 0])
                percent_no_build_triggered = 100.0 * num_no_build_triggered / num_revisions
                num_no_repo = len([r for r in revisions if r.repository_phid is None])
                writer.writerow({
                    name: group,
                    '# revisions': num_revisions,
                    '# tested revisions': num_premt_revisions,
                    '% tested revisions': percentage_premt_revisions,
                    '# untested revisions': num_revisions - num_premt_revisions,
                    '# revisions without builds': num_no_build_triggered,
                    '% revisions without builds': percent_no_build_triggered,
                    '# no repository set': num_no_repo,
                })

    def count_base_revisions(self):
        base_revisions = {}
        base_branches = {}
        for diff in self.diffs.values():
            base_revisions.setdefault(diff.base_revision, 0)
            base_revisions[diff.base_revision] += 1
            base_branches.setdefault(diff.base_branch, 0)
            base_branches[diff.base_branch] += 1
        print(f'{len(self.diffs)} diffs are using {len(base_revisions)} different git base revisions.')
        print('The top 10 revisions and their usages are:')
        revisions = sorted(base_revisions.items(), key=lambda x: x[1], reverse=True)
        for i in revisions[:10]:
            print(f'  commit {i[0]} was used {i[1]} times')
        print()
        print(f'{len(self.diffs)} diffs are using {len(base_branches)} different git base branches.')
        branches = sorted(base_branches.items(), key=lambda x: x[1], reverse=True)
        print('The top 10 branches and their usages are:')
        for i in branches[:10]:
            print(f'  branch {i[0]} was used {i[1]} times')
        print()

    def match_base_revisions_with_repo(self, repo_path: str):
        repo = git.Repo(repo_path)
        not_found = 0
        invalid_date = 0
        has_base_revision = 0
        for diff in self.diffs.values():
            revision = diff.base_revision
            if revision is None:
                continue
            has_base_revision += 1
            try:
                commit = repo.commit(revision)
            except (ValueError, git.BadName):
                not_found += 1
                continue
            committed_date = datetime.datetime.fromtimestamp(commit.committed_date)
            if committed_date > diff.dateCreated:
                invalid_date += 1
        print(f'Of the {has_base_revision} Diffs with a base revision, the base revision was NOT found in the repo for {not_found},')
        print(f'and {invalid_date} base revisions were used before being available upstream.')
        print(f'So {(not_found + invalid_date) / has_base_revision * 100:0.2f} % of the specified base revisions were unusable.')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('repo_path')
    args = parser.parse_args()
    puller = PhabBuildPuller(args.repo_path)
    puller.run()