1
0
Fork 0
llvm-premerge-checks/scripts/metrics/repo_hist.py

462 lines
17 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# Copyright 2019 Google LLC
#
# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
2020-02-21 13:19:11 +01:00
# generate statistics on the llvm github repository
import csv
import datetime
import git
import re
import os
2020-03-18 14:18:07 +01:00
from typing import Dict, Optional, List, Set
2020-02-21 13:19:11 +01:00
import random
import string
# Matches the "Differential Revision" trailer of a commit message; group 1 is
# whatever follows the reviews.llvm.org/ prefix on that line.
REVISION_REGEX = re.compile(
    r"^Differential Revision: https://reviews\.llvm\.org/(.*)$", re.MULTILINE
)
# Matches a summary of the form `Revert "<summary>"`; group 1 is the quoted
# summary of the commit being reverted.
REVERT_REGEX = re.compile(r'^Revert "(.+)"')
# Matches "This reverts commit <hash>" anywhere in a message; group 1 is the hash.
# Must be a raw string: "\w" in a plain literal is an invalid escape sequence.
REVERT_HASH_REGEX = re.compile(r"This reverts commit (\w+)", re.MULTILINE)
class MyCommit:
    """Wrapper around a ``git.Commit`` exposing the fields used for statistics.

    Author and committer e-mail addresses are stored only as salted hashes.
    The salt is regenerated every time the class is created, so the hash
    values are comparable within one run of the script only.
    """

    # Random salt mixed into the e-mail hashes.
    SALT = "".join(
        random.choices(
            string.ascii_lowercase + string.ascii_uppercase + string.digits, k=16
        )
    )

    # The git.* annotations are quoted so they are not evaluated when the
    # class is defined.
    def __init__(self, commit: "git.Commit"):
        self.commit = commit
        self.chash = commit.hexsha  # type: str
        # Both addresses are lower-cased before hashing; otherwise the same
        # address in different capitalisation yields author != commiter.
        self.author = hash(commit.author.email.lower() + MyCommit.SALT)  # type: int
        self.author_domain = commit.author.email.rsplit("@")[-1].lower()  # type: str
        self.commiter = hash(commit.committer.email.lower() + MyCommit.SALT)  # type: int
        self.summary = commit.summary  # type: str
        # Naive datetime built from the commit timestamp (no tz passed, so it
        # is expressed in the local timezone of the machine running this).
        self.date = datetime.datetime.fromtimestamp(
            commit.committed_date
        )  # type: datetime.datetime
        self.phab_revision = self._get_revision(commit)  # type: Optional[str]
        # Both links are filled in from outside (see RepoStats.find_reverts).
        self.reverts = None  # type: Optional[MyCommit]
        self.reverted_by = None  # type: Optional[MyCommit]
        self._diff_index = None  # type: Optional[git.DiffIndex]

    @staticmethod
    def _get_revision(commit: "git.Commit") -> Optional[str]:
        """Return the text captured by REVISION_REGEX in the message, or None."""
        m = REVISION_REGEX.search(commit.message)
        if m is None:
            return None
        return m.group(1)

    @property
    def day(self) -> datetime.date:
        """Date part of ``self.date``."""
        return self.date.date()

    def reverts_summary(self) -> Optional[str]:
        """Return the quoted summary if this summary is `Revert "..."`, else None."""
        m = REVERT_REGEX.search(self.summary)
        if m is None:
            return None
        return m.group(1)

    def __str__(self):
        return self.chash

    @property
    def was_reverted(self) -> bool:
        """True if another commit has been recorded as reverting this one."""
        return self.reverted_by is not None

    @property
    def was_reviewed(self) -> bool:
        """True if the commit message carried a Differential Revision link."""
        return self.phab_revision is not None

    @property
    def is_revert(self) -> bool:
        """True if this commit has been recorded as reverting another one."""
        return self.reverts is not None

    @property
    def week(self) -> str:
        """ISO week key of the form ``YYYY-wWW``.

        The year is the ISO year, not the calendar year: days around New Year
        can belong to a week of the neighbouring year (2019-12-30 is in week 1
        of 2020), and mixing calendar year with ISO week mislabels them.
        """
        iso_year, iso_week, _ = self.date.isocalendar()
        return "{}-w{:02d}".format(iso_year, iso_week)

    @property
    def diff_index(self) -> "git.DiffIndex":
        """Diff between this commit and its first parent (computed once)."""
        # expensive operation, cache the results
        if self._diff_index is None:
            parents = self.commit.parents
            # A root commit has no parent; compare it with the empty tree
            # instead of failing with an IndexError.
            other = parents[0] if parents else git.NULL_TREE
            self._diff_index = self.commit.diff(other, create_patch=True)
        return self._diff_index

    @property
    def num_loc(self) -> int:
        """Number of newline characters over all patches of this commit."""
        nloc = 0
        for diff in self.diff_index:
            # errors="replace": patches are not guaranteed to be valid UTF-8,
            # and undecodable bytes do not affect the newline count.
            nloc += str(diff.diff, encoding="utf8", errors="replace").count("\n")
        return nloc

    @property
    def modified_paths(self) -> Set[str]:
        """All non-None ``a_path`` and ``b_path`` values of the diff."""
        result = set()  # type: Set[str]
        for d in self.diff_index:
            for path in (d.a_path, d.b_path):
                if path is not None:
                    result.add(path)
        return result

    @property
    def modified_projects(self) -> Set[str]:
        """First path component of every modified path."""
        return set(p.split("/")[0] for p in self.modified_paths)

    @property
    def reverts_commit_hash(self):
        """Hash named in a "This reverts commit <hash>" line, or None."""
        m = REVERT_HASH_REGEX.search(self.commit.message)
        if m is None:
            # TODO: double check for "Reverts" in summary line for consistency
            return None
        return m.group(1)
2021-04-27 09:56:24 +02:00
class RepoStats:
    """Collect commits of a git repository and dump statistics about them.

    Intended order of use: ``parse_repo()`` fills the indexes,
    ``find_reverts()`` links reverts, and the ``dump_*`` / ``export_commits``
    methods write CSV files below the relative ``tmp/`` directory or print
    to stdout.
    """

    def __init__(self, git_dir: str):
        """Open the repository at ``git_dir`` and create the empty indexes."""
        self.repo = git.Repo(git_dir)
        self.commit_by_hash = dict()  # type: Dict[str, MyCommit]
        self.commit_by_summary = dict()  # type: Dict[str, List[MyCommit]]
        self.commit_by_week = dict()  # type: Dict[str, List[MyCommit]]
        self.commit_by_author = dict()  # type: Dict[int, List[MyCommit]]
        self.commit_by_author_domain = dict()  # type: Dict[str, List[MyCommit]]

    @staticmethod
    def _percentage(part: int, total: int) -> float:
        """Return ``100 * part / total``, or 0.0 when ``total`` is zero."""
        if total == 0:
            return 0.0
        return 100.0 * part / total

    @staticmethod
    def _open_csv(filename: str):
        """Open ``filename`` for CSV writing, creating its directory if needed."""
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)
        # The csv module asks for files opened with newline="" so that it can
        # control line endings itself.
        return open(filename, "w", newline="")

    def _get_commit(self, commit: "git.Commit") -> "MyCommit":
        """Return the MyCommit indexed for ``commit``, or wrap it on the fly.

        A commit wrapped on the fly has not been seen by ``find_reverts``, so
        its revert links are unset.
        """
        cached = self.commit_by_hash.get(commit.hexsha)
        if cached is None:
            cached = MyCommit(commit)
        return cached

    def parse_repo(self, maxage: datetime.datetime):
        """Index commits of branch "main" until one older than ``maxage`` is hit.

        ``maxage`` is compared against ``commit.committed_datetime``.
        """
        for commit in self.repo.iter_commits("main"):
            if commit.committed_datetime < maxage:
                break
            mycommit = MyCommit(commit)
            self.commit_by_hash[mycommit.chash] = mycommit
            self.commit_by_summary.setdefault(mycommit.summary, []).append(mycommit)
            self.commit_by_week.setdefault(mycommit.week, []).append(mycommit)
            self.commit_by_author.setdefault(mycommit.author, []).append(mycommit)
            self.commit_by_author_domain.setdefault(mycommit.author_domain, []).append(
                mycommit
            )
        print("Read {} commits".format(len(self.commit_by_hash)))

    def find_reverts(self):
        """Link every `Revert "<summary>"` commit with the commit it reverts.

        Matching is done by summary line; when several indexed commits share
        the summary, the last one in the index list is used.
        """
        reverts = 0
        for commit in self.commit_by_hash.values():
            summary = commit.reverts_summary()
            if summary is None:
                continue
            if summary not in self.commit_by_summary:
                print("summary not found: {}".format(summary))
                continue
            # `commit` is the revert itself; the commit found through the
            # quoted summary is the one that got reverted.
            reverted_commit = self.commit_by_summary[summary][-1]
            commit.reverts = reverted_commit
            reverted_commit.reverted_by = commit
            reverts += 1
        print("Found {} reverts".format(reverts))

    # https://stackoverflow.com/questions/2600775/how-to-get-week-number-in-python
    def dump_daily_stats(self):
        """Write one CSV row per week to tmp/llvm-project-weekly.csv.

        Despite the method name, the rows are keyed by ``MyCommit.week``.
        """
        fieldnames = [
            "week",
            "num_commits",
            "num_reverts",
            "percentage_reverts",
            "num_reviewed",
            "percentage_reviewed",
            "# reviewed & revert",
            "# !reviewed & !revert",
            "# !reviewed & revert",
            "# reviewed & !revert",
        ]
        with self._open_csv("tmp/llvm-project-weekly.csv") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
            writer.writeheader()
            for week in sorted(self.commit_by_week):
                commits = self.commit_by_week[week]
                num_commits = len(commits)
                num_reverts = sum(1 for c in commits if c.is_revert)
                num_reviewed = sum(1 for c in commits if c.was_reviewed)
                num_reviewed_revert = sum(
                    1 for c in commits if c.was_reviewed and c.is_revert
                )
                num_reviewed_nrevert = sum(
                    1 for c in commits if c.was_reviewed and not c.is_revert
                )
                num_nreviewed_nrevert = sum(
                    1 for c in commits if not c.was_reviewed and not c.is_revert
                )
                num_nreviewed_revert = sum(
                    1 for c in commits if not c.was_reviewed and c.is_revert
                )
                writer.writerow(
                    {
                        "week": week,
                        "num_commits": num_commits,
                        "num_reverts": num_reverts,
                        "percentage_reverts": self._percentage(
                            num_reverts, num_commits
                        ),
                        "num_reviewed": num_reviewed,
                        # Denominator excludes reverts; it is zero for a week
                        # consisting of reverts only.
                        "percentage_reviewed": self._percentage(
                            num_reviewed, num_commits - num_reverts
                        ),
                        "# reviewed & revert": num_reviewed_revert,
                        "# !reviewed & !revert": num_nreviewed_nrevert,
                        "# !reviewed & revert": num_nreviewed_revert,
                        "# reviewed & !revert": num_reviewed_nrevert,
                    }
                )

    def dump_overall_stats(self):
        """Print totals and percentages over all indexed commits to stdout."""
        all_commits = list(self.commit_by_hash.values())
        num_commits = len(all_commits)
        num_reverts = sum(1 for c in all_commits if c.is_revert)
        print("Number of commits: {}".format(num_commits))
        print("Number of reverts: {}".format(num_reverts))
        print(
            "percentage of reverts: {:0.2f}".format(
                self._percentage(num_reverts, num_commits)
            )
        )

        num_reviewed = sum(1 for c in all_commits if c.was_reviewed)
        print("Number of reviewed commits: {}".format(num_reviewed))
        print(
            "percentage of reviewed commits: {:0.2f}".format(
                self._percentage(num_reviewed, num_commits)
            )
        )

        num_reviewed_reverted = sum(
            1 for c in all_commits if c.was_reviewed and c.was_reverted
        )
        num_not_reviewed_reverted = sum(
            1 for c in all_commits if not c.was_reviewed and c.was_reverted
        )
        print("Number of reviewed that were reverted: {}".format(num_reviewed_reverted))
        print(
            "Number of NOT reviewed that were reverted: {}".format(
                num_not_reviewed_reverted
            )
        )
        print(
            "percentage of reviewed that were reverted: {:0.2f}".format(
                self._percentage(num_reviewed_reverted, num_reviewed)
            )
        )
        print(
            "percentage of NOT reviewed that were reverted: {:0.2f}".format(
                self._percentage(num_not_reviewed_reverted, num_commits - num_reviewed)
            )
        )

        num_foreign_committer = sum(1 for c in all_commits if c.author != c.commiter)
        print(
            "Number of commits where author != committer: {}".format(
                num_foreign_committer
            )
        )
        print(
            "Percentage of commits where author != committer: {:0.2f}".format(
                self._percentage(num_foreign_committer, num_commits)
            )
        )

    def dump_author_stats(self):
        """Write one CSV row per (hashed) author to tmp/llvm-project-authors.csv."""
        print("Number of authors: {}".format(len(self.commit_by_author)))
        fieldnames = [
            "author",
            "num_commits",
            "num_reverts",
            "percentage_reverts",
            "num_reviewed",
            "percentage_reviewed",
        ]
        with self._open_csv("tmp/llvm-project-authors.csv") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
            writer.writeheader()
            for author, commits in self.commit_by_author.items():
                num_commits = len(commits)
                # Here "reverts" counts the author's commits that were reverted.
                num_reverts = sum(1 for c in commits if c.was_reverted)
                num_reviewed = sum(1 for c in commits if c.was_reviewed)
                writer.writerow(
                    {
                        "author": author,
                        "num_commits": num_commits,
                        "num_reverts": num_reverts,
                        "percentage_reverts": self._percentage(
                            num_reverts, num_commits
                        ),
                        "num_reviewed": num_reviewed,
                        "percentage_reviewed": self._percentage(
                            num_reviewed, num_commits
                        ),
                    }
                )

    def dump_author_domain_stats(self):
        """Write one CSV row per author e-mail domain to
        tmp/llvm-project-author_domains.csv."""
        print("Number of authors: {}".format(len(self.commit_by_author)))
        fieldnames = ["author_domain", "num_commits", "num_committers"]
        with self._open_csv("tmp/llvm-project-author_domains.csv") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
            writer.writeheader()
            for author_domain, commits in self.commit_by_author_domain.items():
                # "num_committers" is the number of distinct author hashes.
                committers = set(c.author for c in commits)
                writer.writerow(
                    {
                        "author_domain": author_domain,
                        "num_commits": len(commits),
                        "num_committers": len(committers),
                    }
                )

    def dump_unreviewed_paths(self, maxage: datetime.datetime):
        """Count, per top-level directory, reviewed and unreviewed commits.

        Walks branch "main" until a commit older than ``maxage`` is hit and
        writes two rows (reviewed / not reviewed) to
        tmp/llvm-project-unreviewed-paths.csv.
        """
        # TODO: this is really slow. Maybe parallelize?
        path_count = {
            True: {},
            False: {},
        }  # type: Dict[bool, Dict[str, int]]
        for commit in self.repo.iter_commits("main"):
            if commit.committed_datetime < maxage:
                break
            mycommit = self._get_commit(commit)
            counts = path_count[mycommit.was_reviewed]
            for prefix in mycommit.modified_projects:
                counts[prefix] = counts.get(prefix, 0) + 1

        fieldnames = ["was_reviewed"]
        all_paths = set(path_count[True])
        all_paths.update(path_count[False])
        fieldnames.extend(sorted(all_paths))
        with self._open_csv("tmp/llvm-project-unreviewed-paths.csv") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
            writer.writeheader()
            for reviewed in [True, False]:
                row = {"was_reviewed": reviewed}
                row.update(path_count[reviewed])
                writer.writerow(row)

    def dump_loc_commits(self, maxage: datetime.datetime):
        """Write histograms of commit size to tmp/llvm-project-unreviewed-loc.csv.

        Commits are bucketed by ``num_loc`` in steps of 100 up to 2000; one
        pair of rows is split by ``was_reviewed``, a second pair by
        ``was_reverted``.
        """
        # TODO: this is really slow. Maybe parallelize?
        buckets = list(range(0, 2001, 100))
        review_dict = {
            True: {b: 0 for b in buckets},
            False: {b: 0 for b in buckets},
        }  # type: Dict[bool, Dict[int, int]]
        reverted_dict = {
            True: {b: 0 for b in buckets},
            False: {b: 0 for b in buckets},
        }  # type: Dict[bool, Dict[int, int]]
        for commit in self.repo.iter_commits("main"):
            if commit.committed_datetime < maxage:
                break
            # Falls back to a fresh wrapper when `maxage` reaches further back
            # than what parse_repo() indexed.
            mycommit = self._get_commit(commit)
            bucket = self._find_bucket(mycommit.num_loc, buckets)
            review_dict[mycommit.was_reviewed][bucket] += 1
            reverted_dict[mycommit.was_reverted][bucket] += 1

        # Column i + 1 belongs to the bucket whose lower bound is buckets[i].
        fieldnames = ["was_reviewed"]
        for i in range(0, len(buckets) - 1):
            fieldnames.append("{}-{}".format(buckets[i], buckets[i + 1] - 1))
        fieldnames.append(">={}".format(buckets[-1]))
        with self._open_csv("tmp/llvm-project-unreviewed-loc.csv") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
            writer.writeheader()
            for reviewed in [True, False]:
                row = {"was_reviewed": reviewed}
                for i, bucket in enumerate(buckets):
                    row[fieldnames[i + 1]] = review_dict[reviewed][bucket]
                writer.writerow(row)
            # Separator row announcing the second histogram.
            writer.writerow({"was_reviewed": "reverted"})
            for reverted in [True, False]:
                row = {"was_reviewed": reverted}
                for i, bucket in enumerate(buckets):
                    row[fieldnames[i + 1]] = reverted_dict[reverted][bucket]
                writer.writerow(row)

    @staticmethod
    def _find_bucket(number: int, buckets: List[int]) -> int:
        """Return the largest entry of ascending ``buckets`` that is <= ``number``.

        Entries are lower bounds, matching the "<low>-<high>" column names in
        ``dump_loc_commits``. Numbers below the first entry map to the first
        entry.
        """
        for bucket in reversed(buckets):
            if number >= bucket:
                return bucket
        return buckets[0]

    def export_commits(self):
        """Write one CSV row per indexed commit to tmp/llvm-project-export.csv."""
        print("starting export...")
        fieldnames = [
            "timestamp",
            "hash",
            "reviewed",
            "was_reverted",
            "is_revert",
            "# LOC changed",
            "modified projects",
            "author domain",
            "revision",
        ]
        with self._open_csv("tmp/llvm-project-export.csv") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
            writer.writeheader()
            # did not work with multiprocessing.map, gave recursion error from gitpython...
            # so using normal map function
            for row in map(_create_row, self.commit_by_hash.values()):
                # _create_row returns {} for a commit it could not convert;
                # skip it rather than emitting a row of empty cells.
                if row:
                    writer.writerow(row)
def _create_row(mycommit: MyCommit) -> Dict:
try:
return {
2021-04-27 09:56:24 +02:00
"timestamp": mycommit.date.isoformat(),
"hash": mycommit.chash,
"reviewed": mycommit.was_reviewed,
"was_reverted": mycommit.was_reverted,
"is_revert": mycommit.is_revert,
"# LOC changed": mycommit.num_loc,
"modified projects": (";".join(mycommit.modified_projects)),
"author domain": mycommit.author_domain,
"revision": mycommit.phab_revision
if mycommit.phab_revision is not None
else "",
2020-03-19 09:42:52 +01:00
}
except Exception as e:
print(e)
return {}
2021-04-27 09:56:24 +02:00
if __name__ == "__main__":
    # Parsing stops at the first commit on "main" older than this instant.
    max_age = datetime.datetime(2019, 10, 1, tzinfo=datetime.timezone.utc)
    now = datetime.datetime.now(tz=datetime.timezone.utc)

    # TODO: make the path configurable, and `git clone/pull`
    repo_stats = RepoStats(os.path.expanduser("~/git/llvm-project"))
    repo_stats.parse_repo(max_age)
    repo_stats.find_reverts()

    repo_stats.dump_daily_stats()
    repo_stats.dump_overall_stats()
    repo_stats.dump_author_stats()
    repo_stats.dump_author_domain_stats()
    # disabled as it's quite slow
    # repo_stats.dump_unreviewed_paths(now - datetime.timedelta(days=100))
    # repo_stats.dump_loc_commits(now - datetime.timedelta(days=100))
    repo_stats.export_commits()
    print("Done.")