first query on DB working
This commit is contained in:
parent
08bb6492fa
commit
06c97f1dc5
2 changed files with 356 additions and 141 deletions
360
scripts/metrics/repo_hist.py
Normal file → Executable file
360
scripts/metrics/repo_hist.py
Normal file → Executable file
|
@ -25,15 +25,19 @@ import random
|
||||||
import string
|
import string
|
||||||
|
|
||||||
REVISION_REGEX = re.compile(
|
REVISION_REGEX = re.compile(
|
||||||
r'^Differential Revision: https://reviews\.llvm\.org/(.*)$',
|
r"^Differential Revision: https://reviews\.llvm\.org/(.*)$", re.MULTILINE
|
||||||
re.MULTILINE)
|
)
|
||||||
REVERT_REGEX = re.compile(r'^Revert "(.+)"')
|
REVERT_REGEX = re.compile(r'^Revert "(.+)"')
|
||||||
|
# Matches the "This reverts commit <hash>" line of a commit message and captures
# the hash. Raw string so that `\w` reaches the regex engine as a character
# class instead of being an invalid string escape.
REVERT_HASH_REGEX = re.compile(r"This reverts commit (\w+)", re.MULTILINE)
|
||||||
|
|
||||||
|
|
||||||
class MyCommit:
|
class MyCommit:
|
||||||
|
|
||||||
SALT = ''.join(random.choices(
|
SALT = "".join(
|
||||||
string.ascii_lowercase + string.ascii_uppercase + string.digits, k=16))
|
random.choices(
|
||||||
|
string.ascii_lowercase + string.ascii_uppercase + string.digits, k=16
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
def __init__(self, commit: git.Commit):
|
def __init__(self, commit: git.Commit):
|
||||||
self.commit = commit
|
self.commit = commit
|
||||||
|
@ -43,10 +47,11 @@ class MyCommit:
|
||||||
self.commiter = hash(commit.committer.email.lower() + MyCommit.SALT) # type:int
|
self.commiter = hash(commit.committer.email.lower() + MyCommit.SALT) # type:int
|
||||||
self.summary = commit.summary # type: str
|
self.summary = commit.summary # type: str
|
||||||
self.date = datetime.datetime.fromtimestamp(
|
self.date = datetime.datetime.fromtimestamp(
|
||||||
commit.committed_date) # type: datetime.datetime
|
commit.committed_date
|
||||||
|
) # type: datetime.datetime
|
||||||
self.phab_revision = self._get_revision(commit) # type: Optional[str]
|
self.phab_revision = self._get_revision(commit) # type: Optional[str]
|
||||||
self.reverts = None # type: Optional[MyCommit]
|
self.reverts = None # type: Optional[MyCommit]
|
||||||
self.reverted_by = None # type: Optional[MyCommit]
|
self.reverted_by = None # type: Optional[MyCommit]
|
||||||
self._diff_index = None # type: Optional[git.DiffIndex]
|
self._diff_index = None # type: Optional[git.DiffIndex]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
@ -83,20 +88,22 @@ class MyCommit:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def week(self) -> str:
|
def week(self) -> str:
|
||||||
return '{}-w{:02d}'.format(self.date.year, self.date.isocalendar()[1])
|
return "{}-w{:02d}".format(self.date.year, self.date.isocalendar()[1])
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def diff_index(self) -> git.DiffIndex:
|
def diff_index(self) -> git.DiffIndex:
|
||||||
# expensive operation, cache the results
|
# expensive operation, cache the results
|
||||||
if self._diff_index is None:
|
if self._diff_index is None:
|
||||||
self._diff_index = self.commit.diff(self.commit.parents[0], create_patch=True)
|
self._diff_index = self.commit.diff(
|
||||||
|
self.commit.parents[0], create_patch=True
|
||||||
|
)
|
||||||
return self._diff_index
|
return self._diff_index
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def num_loc(self) -> int:
|
def num_loc(self) -> int:
|
||||||
nloc = 0
|
nloc = 0
|
||||||
for diff in self.diff_index:
|
for diff in self.diff_index:
|
||||||
nloc += str(diff.diff, encoding='utf8').count('\n')
|
nloc += str(diff.diff, encoding="utf8").count("\n")
|
||||||
return nloc
|
return nloc
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -107,11 +114,18 @@ class MyCommit:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def modified_projects(self) -> Set[str]:
|
def modified_projects(self) -> Set[str]:
|
||||||
return set(p.split('/')[0] for p in self.modified_paths)
|
return set(p.split("/")[0] for p in self.modified_paths)
|
||||||
|
|
||||||
|
@property
def reverts_commit_hash(self):
    """Hash of the commit that this commit reverts, or None.

    Searches the full commit message with REVERT_HASH_REGEX and returns the
    first capture group when the pattern is found.
    """
    found = REVERT_HASH_REGEX.search(self.commit.message)
    if found is not None:
        return found.group(1)
    # TODO: double check for "Reverts" in summary line for consistency
    return None
|
||||||
|
|
||||||
|
|
||||||
class RepoStats:
|
class RepoStats:
|
||||||
|
|
||||||
def __init__(self, git_dir: str):
|
def __init__(self, git_dir: str):
|
||||||
self.repo = git.Repo(git_dir)
|
self.repo = git.Repo(git_dir)
|
||||||
self.commit_by_hash = dict() # type: Dict[str, MyCommit]
|
self.commit_by_hash = dict() # type: Dict[str, MyCommit]
|
||||||
|
@ -120,20 +134,19 @@ class RepoStats:
|
||||||
self.commit_by_author = dict() # type: Dict[int, List[MyCommit]]
|
self.commit_by_author = dict() # type: Dict[int, List[MyCommit]]
|
||||||
self.commit_by_author_domain = dict() # type: Dict[str, List[MyCommit]]
|
self.commit_by_author_domain = dict() # type: Dict[str, List[MyCommit]]
|
||||||
|
|
||||||
def parse_repo(self, maxage: datetime.datetime):
|
def parse_repo(self, maxage: datetime.datetime):
|
||||||
for commit in self.repo.iter_commits('main'):
|
for commit in self.repo.iter_commits("main"):
|
||||||
if commit.committed_datetime < maxage:
|
if commit.committed_datetime < maxage:
|
||||||
break
|
break
|
||||||
mycommit = MyCommit(commit)
|
mycommit = MyCommit(commit)
|
||||||
self.commit_by_hash[mycommit.chash] = mycommit
|
self.commit_by_hash[mycommit.chash] = mycommit
|
||||||
self.commit_by_summary.setdefault(mycommit.summary, [])\
|
self.commit_by_summary.setdefault(mycommit.summary, []).append(mycommit)
|
||||||
.append(mycommit)
|
|
||||||
self.commit_by_week.setdefault(mycommit.week, []).append(mycommit)
|
self.commit_by_week.setdefault(mycommit.week, []).append(mycommit)
|
||||||
self.commit_by_author.setdefault(mycommit.author, [])\
|
self.commit_by_author.setdefault(mycommit.author, []).append(mycommit)
|
||||||
.append(mycommit)
|
self.commit_by_author_domain.setdefault(mycommit.author_domain, []).append(
|
||||||
self.commit_by_author_domain.setdefault(mycommit.author_domain, []) \
|
mycommit
|
||||||
.append(mycommit)
|
)
|
||||||
print('Read {} commits'.format(len(self.commit_by_hash)))
|
print("Read {} commits".format(len(self.commit_by_hash)))
|
||||||
|
|
||||||
def find_reverts(self):
|
def find_reverts(self):
|
||||||
reverts = 0
|
reverts = 0
|
||||||
|
@ -142,119 +155,171 @@ class RepoStats:
|
||||||
if summary is None:
|
if summary is None:
|
||||||
continue
|
continue
|
||||||
if summary not in self.commit_by_summary:
|
if summary not in self.commit_by_summary:
|
||||||
print('summary not found: {}'.format(summary))
|
print("summary not found: {}".format(summary))
|
||||||
continue
|
continue
|
||||||
reverting_commit = self.commit_by_summary[summary][-1]
|
reverting_commit = self.commit_by_summary[summary][-1]
|
||||||
commit.reverted_by = reverting_commit
|
commit.reverted_by = reverting_commit
|
||||||
reverting_commit.reverts = commit
|
reverting_commit.reverts = commit
|
||||||
reverts += 1
|
reverts += 1
|
||||||
print('Found {} reverts'.format(reverts))
|
print("Found {} reverts".format(reverts))
|
||||||
|
|
||||||
# https://stackoverflow.com/questions/2600775/how-to-get-week-number-in-python
|
# https://stackoverflow.com/questions/2600775/how-to-get-week-number-in-python
|
||||||
def dump_daily_stats(self):
|
def dump_daily_stats(self):
|
||||||
fieldnames = ["week", "num_commits", "num_reverts", "percentage_reverts",
|
fieldnames = [
|
||||||
"num_reviewed", "percentage_reviewed",
|
"week",
|
||||||
"# reviewed & revert", "# !reviewed & !revert", "# !reviewed & revert", "# reviewed & !revert"]
|
"num_commits",
|
||||||
csvfile = open('tmp/llvm-project-weekly.csv', 'w')
|
"num_reverts",
|
||||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
|
"percentage_reverts",
|
||||||
dialect=csv.excel)
|
"num_reviewed",
|
||||||
|
"percentage_reviewed",
|
||||||
|
"# reviewed & revert",
|
||||||
|
"# !reviewed & !revert",
|
||||||
|
"# !reviewed & revert",
|
||||||
|
"# reviewed & !revert",
|
||||||
|
]
|
||||||
|
csvfile = open("tmp/llvm-project-weekly.csv", "w")
|
||||||
|
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
|
||||||
writer.writeheader()
|
writer.writeheader()
|
||||||
for week in sorted(self.commit_by_week.keys()):
|
for week in sorted(self.commit_by_week.keys()):
|
||||||
commits = self.commit_by_week[week]
|
commits = self.commit_by_week[week]
|
||||||
num_commits = len(commits)
|
num_commits = len(commits)
|
||||||
num_reverts = len([c for c in commits if c.is_revert])
|
num_reverts = len([c for c in commits if c.is_revert])
|
||||||
percentage_reverts = 100.0*num_reverts / num_commits
|
percentage_reverts = 100.0 * num_reverts / num_commits
|
||||||
num_reviewed = len([c for c in commits if c.was_reviewed])
|
num_reviewed = len([c for c in commits if c.was_reviewed])
|
||||||
percentage_reviewed = 100*num_reviewed / (num_commits - num_reverts)
|
percentage_reviewed = 100 * num_reviewed / (num_commits - num_reverts)
|
||||||
num_reviewed_revert = len([c for c in commits if c.was_reviewed and c.is_revert])
|
num_reviewed_revert = len(
|
||||||
num_reviewed_nrevert = len([c for c in commits if c.was_reviewed and not c.is_revert])
|
[c for c in commits if c.was_reviewed and c.is_revert]
|
||||||
num_nreviewed_nrevert = len([c for c in commits if not c.was_reviewed and not c.is_revert])
|
)
|
||||||
num_nreviewed_revert = len([c for c in commits if not c.was_reviewed and c.is_revert])
|
num_reviewed_nrevert = len(
|
||||||
writer.writerow({
|
[c for c in commits if c.was_reviewed and not c.is_revert]
|
||||||
"week": week,
|
)
|
||||||
"num_commits": num_commits,
|
num_nreviewed_nrevert = len(
|
||||||
"num_reverts": num_reverts,
|
[c for c in commits if not c.was_reviewed and not c.is_revert]
|
||||||
"percentage_reverts": percentage_reverts,
|
)
|
||||||
"num_reviewed": num_reviewed,
|
num_nreviewed_revert = len(
|
||||||
"percentage_reviewed": percentage_reviewed,
|
[c for c in commits if not c.was_reviewed and c.is_revert]
|
||||||
"# reviewed & revert": num_reviewed_revert,
|
)
|
||||||
"# !reviewed & !revert": num_nreviewed_nrevert,
|
writer.writerow(
|
||||||
"# !reviewed & revert": num_nreviewed_revert,
|
{
|
||||||
"# reviewed & !revert": num_reviewed_nrevert,
|
"week": week,
|
||||||
})
|
"num_commits": num_commits,
|
||||||
|
"num_reverts": num_reverts,
|
||||||
|
"percentage_reverts": percentage_reverts,
|
||||||
|
"num_reviewed": num_reviewed,
|
||||||
|
"percentage_reviewed": percentage_reviewed,
|
||||||
|
"# reviewed & revert": num_reviewed_revert,
|
||||||
|
"# !reviewed & !revert": num_nreviewed_nrevert,
|
||||||
|
"# !reviewed & revert": num_nreviewed_revert,
|
||||||
|
"# reviewed & !revert": num_reviewed_nrevert,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
def dump_overall_stats(self):
|
def dump_overall_stats(self):
|
||||||
num_commits = len(self.commit_by_hash)
|
num_commits = len(self.commit_by_hash)
|
||||||
num_reverts = len([c for c in self.commit_by_hash.values()
|
num_reverts = len([c for c in self.commit_by_hash.values() if c.is_revert])
|
||||||
if c.is_revert])
|
print("Number of commits: {}".format(num_commits))
|
||||||
print('Number of commits: {}'.format(num_commits))
|
print("Number of reverts: {}".format(num_reverts))
|
||||||
print('Number of reverts: {}'.format(num_reverts))
|
print("percentage of reverts: {:0.2f}".format(100 * num_reverts / num_commits))
|
||||||
print('percentage of reverts: {:0.2f}'.format(
|
|
||||||
100*num_reverts / num_commits))
|
|
||||||
|
|
||||||
num_reviewed = len([c for c in self.commit_by_hash.values()
|
num_reviewed = len([c for c in self.commit_by_hash.values() if c.was_reviewed])
|
||||||
if c.was_reviewed])
|
print("Number of reviewed commits: {}".format(num_reviewed))
|
||||||
print('Number of reviewed commits: {}'.format(num_reviewed))
|
print(
|
||||||
print('percentage of reviewed commits: {:0.2f}'.format(
|
"percentage of reviewed commits: {:0.2f}".format(
|
||||||
100*num_reviewed / num_commits))
|
100 * num_reviewed / num_commits
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
num_reviewed_reverted = len([c for c in self.commit_by_hash.values()
|
num_reviewed_reverted = len(
|
||||||
if c.was_reviewed and c.was_reverted])
|
[
|
||||||
num_not_reviewed_reverted = len([c for c in self.commit_by_hash.values()
|
c
|
||||||
if not c.was_reviewed and
|
for c in self.commit_by_hash.values()
|
||||||
c.was_reverted])
|
if c.was_reviewed and c.was_reverted
|
||||||
print('Number of reviewed that were reverted: {}'.format(num_reviewed_reverted))
|
]
|
||||||
print('Number of NOT reviewed that were reverted: {}'.format(num_not_reviewed_reverted))
|
)
|
||||||
print('percentage of reviewed that were reverted: {:0.2f}'.format(
|
num_not_reviewed_reverted = len(
|
||||||
100*num_reviewed_reverted / num_reviewed))
|
[
|
||||||
print('percentage of NOT reviewed that were reverted: {:0.2f}'.format(
|
c
|
||||||
100*num_not_reviewed_reverted / (num_commits-num_reviewed)))
|
for c in self.commit_by_hash.values()
|
||||||
|
if not c.was_reviewed and c.was_reverted
|
||||||
|
]
|
||||||
|
)
|
||||||
|
print("Number of reviewed that were reverted: {}".format(num_reviewed_reverted))
|
||||||
|
print(
|
||||||
|
"Number of NOT reviewed that were reverted: {}".format(
|
||||||
|
num_not_reviewed_reverted
|
||||||
|
)
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
"percentage of reviewed that were reverted: {:0.2f}".format(
|
||||||
|
100 * num_reviewed_reverted / num_reviewed
|
||||||
|
)
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
"percentage of NOT reviewed that were reverted: {:0.2f}".format(
|
||||||
|
100 * num_not_reviewed_reverted / (num_commits - num_reviewed)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
num_foreign_committer = len([c for c in self.commit_by_hash.values()
|
num_foreign_committer = len(
|
||||||
if c.author != c.commiter])
|
[c for c in self.commit_by_hash.values() if c.author != c.commiter]
|
||||||
print('Number of commits where author != committer: {}'.format(
|
)
|
||||||
num_foreign_committer))
|
print(
|
||||||
print('Percentage of commits where author != committer: {:0.2f}'.format(
|
"Number of commits where author != committer: {}".format(
|
||||||
100*num_foreign_committer/num_commits))
|
num_foreign_committer
|
||||||
|
)
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
"Percentage of commits where author != committer: {:0.2f}".format(
|
||||||
|
100 * num_foreign_committer / num_commits
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
def dump_author_stats(self):
|
def dump_author_stats(self):
|
||||||
print('Number of authors: {}'.format(len(self.commit_by_author)))
|
print("Number of authors: {}".format(len(self.commit_by_author)))
|
||||||
fieldnames = ["author", "num_commits", "num_reverts", "percentage_reverts",
|
fieldnames = [
|
||||||
"num_reviewed", "percentage_reviewed"]
|
"author",
|
||||||
csvfile = open('tmp/llvm-project-authors.csv', 'w')
|
"num_commits",
|
||||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
|
"num_reverts",
|
||||||
dialect=csv.excel)
|
"percentage_reverts",
|
||||||
|
"num_reviewed",
|
||||||
|
"percentage_reviewed",
|
||||||
|
]
|
||||||
|
csvfile = open("tmp/llvm-project-authors.csv", "w")
|
||||||
|
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
|
||||||
writer.writeheader()
|
writer.writeheader()
|
||||||
for author, commits in self.commit_by_author.items():
|
for author, commits in self.commit_by_author.items():
|
||||||
num_commits = len(commits)
|
num_commits = len(commits)
|
||||||
num_reverts = len([c for c in commits if c.was_reverted])
|
num_reverts = len([c for c in commits if c.was_reverted])
|
||||||
percentage_reverts = 100 * num_reverts / num_commits
|
percentage_reverts = 100 * num_reverts / num_commits
|
||||||
num_reviewed = len([c for c in commits if c.was_reviewed])
|
num_reviewed = len([c for c in commits if c.was_reviewed])
|
||||||
percentage_reviewed = 100*num_reviewed / num_commits
|
percentage_reviewed = 100 * num_reviewed / num_commits
|
||||||
writer.writerow({
|
writer.writerow(
|
||||||
"author": author,
|
{
|
||||||
"num_commits": num_commits,
|
"author": author,
|
||||||
"num_reverts": num_reverts,
|
"num_commits": num_commits,
|
||||||
"percentage_reverts": percentage_reverts,
|
"num_reverts": num_reverts,
|
||||||
"num_reviewed": num_reviewed,
|
"percentage_reverts": percentage_reverts,
|
||||||
"percentage_reviewed": percentage_reviewed,
|
"num_reviewed": num_reviewed,
|
||||||
})
|
"percentage_reviewed": percentage_reviewed,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
def dump_author_domain_stats(self):
|
def dump_author_domain_stats(self):
|
||||||
print('Number of authors: {}'.format(len(self.commit_by_author)))
|
print("Number of authors: {}".format(len(self.commit_by_author)))
|
||||||
fieldnames = ["author_domain", "num_commits", "num_committers"]
|
fieldnames = ["author_domain", "num_commits", "num_committers"]
|
||||||
csvfile = open('tmp/llvm-project-author_domains.csv', 'w')
|
csvfile = open("tmp/llvm-project-author_domains.csv", "w")
|
||||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
|
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
|
||||||
dialect=csv.excel)
|
|
||||||
writer.writeheader()
|
writer.writeheader()
|
||||||
for author_domain, commits in self.commit_by_author_domain.items():
|
for author_domain, commits in self.commit_by_author_domain.items():
|
||||||
num_commits = len(commits)
|
num_commits = len(commits)
|
||||||
committers = set(c.author for c in commits)
|
committers = set(c.author for c in commits)
|
||||||
writer.writerow({
|
writer.writerow(
|
||||||
"author_domain": author_domain,
|
{
|
||||||
"num_commits": num_commits,
|
"author_domain": author_domain,
|
||||||
"num_committers": len(committers),
|
"num_commits": num_commits,
|
||||||
})
|
"num_committers": len(committers),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
def dump_unreviewed_paths(self, maxage: datetime.datetime):
|
def dump_unreviewed_paths(self, maxage: datetime.datetime):
|
||||||
# TODO: this is really slow. Maybe parallelize?
|
# TODO: this is really slow. Maybe parallelize?
|
||||||
|
@ -262,23 +327,22 @@ class RepoStats:
|
||||||
True: {},
|
True: {},
|
||||||
False: {},
|
False: {},
|
||||||
} # type: Dict[bool, Dict[str, int]]
|
} # type: Dict[bool, Dict[str, int]]
|
||||||
for commit in self.repo.iter_commits('main'):
|
for commit in self.repo.iter_commits("main"):
|
||||||
if commit.committed_datetime < maxage:
|
if commit.committed_datetime < maxage:
|
||||||
break
|
break
|
||||||
mycommit = MyCommit(commit)
|
mycommit = MyCommit(commit)
|
||||||
for prefix in set(p.split('/')[0] for p in mycommit.modified_paths):
|
for prefix in set(p.split("/")[0] for p in mycommit.modified_paths):
|
||||||
path_count[mycommit.was_reviewed].setdefault(prefix, 0)
|
path_count[mycommit.was_reviewed].setdefault(prefix, 0)
|
||||||
path_count[mycommit.was_reviewed][prefix] += 1
|
path_count[mycommit.was_reviewed][prefix] += 1
|
||||||
fieldnames = ['was_reviewed']
|
fieldnames = ["was_reviewed"]
|
||||||
all_paths = set(path_count[True].keys())
|
all_paths = set(path_count[True].keys())
|
||||||
all_paths.update(path_count[False].keys())
|
all_paths.update(path_count[False].keys())
|
||||||
fieldnames.extend(sorted(all_paths))
|
fieldnames.extend(sorted(all_paths))
|
||||||
csvfile = open('tmp/llvm-project-unreviewed-paths.csv', 'w')
|
csvfile = open("tmp/llvm-project-unreviewed-paths.csv", "w")
|
||||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
|
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
|
||||||
dialect=csv.excel)
|
|
||||||
writer.writeheader()
|
writer.writeheader()
|
||||||
for reviewed in [True, False]:
|
for reviewed in [True, False]:
|
||||||
row = {'was_reviewed': reviewed}
|
row = {"was_reviewed": reviewed}
|
||||||
for path, count in path_count[reviewed].items():
|
for path, count in path_count[reviewed].items():
|
||||||
row[path] = count
|
row[path] = count
|
||||||
writer.writerow(row)
|
writer.writerow(row)
|
||||||
|
@ -295,31 +359,34 @@ class RepoStats:
|
||||||
True: {b: 0 for b in buckets},
|
True: {b: 0 for b in buckets},
|
||||||
False: {b: 0 for b in buckets},
|
False: {b: 0 for b in buckets},
|
||||||
} # type: Dict[bool, Dict[int, int]]
|
} # type: Dict[bool, Dict[int, int]]
|
||||||
for commit in self.repo.iter_commits('main'):
|
for commit in self.repo.iter_commits("main"):
|
||||||
if commit.committed_datetime < maxage:
|
if commit.committed_datetime < maxage:
|
||||||
break
|
break
|
||||||
mycommit = self.commit_by_hash[commit.hexsha]
|
mycommit = self.commit_by_hash[commit.hexsha]
|
||||||
review_dict[mycommit.was_reviewed][self._find_bucket(mycommit.num_loc, buckets)] += 1
|
review_dict[mycommit.was_reviewed][
|
||||||
reverted_dict[mycommit.was_reverted][self._find_bucket(mycommit.num_loc, buckets)] += 1
|
self._find_bucket(mycommit.num_loc, buckets)
|
||||||
fieldnames = ['was_reviewed']
|
] += 1
|
||||||
for i in range(0, len(buckets)-1):
|
reverted_dict[mycommit.was_reverted][
|
||||||
fieldnames.append('{}-{}'.format(buckets[i], buckets[i+1]-1))
|
self._find_bucket(mycommit.num_loc, buckets)
|
||||||
fieldnames.append('>={}'.format(buckets[-1]))
|
] += 1
|
||||||
csvfile = open('tmp/llvm-project-unreviewed-loc.csv', 'w')
|
fieldnames = ["was_reviewed"]
|
||||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
|
for i in range(0, len(buckets) - 1):
|
||||||
dialect=csv.excel)
|
fieldnames.append("{}-{}".format(buckets[i], buckets[i + 1] - 1))
|
||||||
|
fieldnames.append(">={}".format(buckets[-1]))
|
||||||
|
csvfile = open("tmp/llvm-project-unreviewed-loc.csv", "w")
|
||||||
|
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
|
||||||
writer.writeheader()
|
writer.writeheader()
|
||||||
for reviewed in [True, False]:
|
for reviewed in [True, False]:
|
||||||
row = {'was_reviewed': reviewed}
|
row = {"was_reviewed": reviewed}
|
||||||
for i in range(0, len(buckets)):
|
for i in range(0, len(buckets)):
|
||||||
row[fieldnames[i+1]] = review_dict[reviewed][buckets[i]]
|
row[fieldnames[i + 1]] = review_dict[reviewed][buckets[i]]
|
||||||
writer.writerow(row)
|
writer.writerow(row)
|
||||||
|
|
||||||
writer.writerow({'was_reviewed': 'reverted'})
|
writer.writerow({"was_reviewed": "reverted"})
|
||||||
for reverted in [True, False]:
|
for reverted in [True, False]:
|
||||||
row = {'was_reviewed': reverted}
|
row = {"was_reviewed": reverted}
|
||||||
for i in range(0, len(buckets)):
|
for i in range(0, len(buckets)):
|
||||||
row[fieldnames[i+1]] = reverted_dict[reverted][buckets[i]]
|
row[fieldnames[i + 1]] = reverted_dict[reverted][buckets[i]]
|
||||||
writer.writerow(row)
|
writer.writerow(row)
|
||||||
csvfile.close()
|
csvfile.close()
|
||||||
|
|
||||||
|
@ -332,12 +399,20 @@ class RepoStats:
|
||||||
|
|
||||||
def export_commits(self):
|
def export_commits(self):
|
||||||
|
|
||||||
print('starting export...')
|
print("starting export...")
|
||||||
csvfile = open('tmp/llvm-project-export.csv', 'w')
|
csvfile = open("tmp/llvm-project-export.csv", "w")
|
||||||
fieldnames = ['timestamp', 'hash', 'reviewed', 'was_reverted', 'is_revert', '# LOC changed',
|
fieldnames = [
|
||||||
'modified projects', 'author domain', 'revision']
|
"timestamp",
|
||||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
|
"hash",
|
||||||
dialect=csv.excel)
|
"reviewed",
|
||||||
|
"was_reverted",
|
||||||
|
"is_revert",
|
||||||
|
"# LOC changed",
|
||||||
|
"modified projects",
|
||||||
|
"author domain",
|
||||||
|
"revision",
|
||||||
|
]
|
||||||
|
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
|
||||||
writer.writeheader()
|
writer.writeheader()
|
||||||
# did not work with multiprocessing.map, gave recursion error from gitpython...
|
# did not work with multiprocessing.map, gave recursion error from gitpython...
|
||||||
# so using normal map function
|
# so using normal map function
|
||||||
|
@ -349,26 +424,29 @@ class RepoStats:
|
||||||
def _create_row(mycommit: MyCommit) -> Dict:
|
def _create_row(mycommit: MyCommit) -> Dict:
|
||||||
try:
|
try:
|
||||||
return {
|
return {
|
||||||
'timestamp': mycommit.date.isoformat(),
|
"timestamp": mycommit.date.isoformat(),
|
||||||
'hash': mycommit.chash,
|
"hash": mycommit.chash,
|
||||||
'reviewed': mycommit.was_reviewed,
|
"reviewed": mycommit.was_reviewed,
|
||||||
'was_reverted': mycommit.was_reverted,
|
"was_reverted": mycommit.was_reverted,
|
||||||
'is_revert': mycommit.is_revert,
|
"is_revert": mycommit.is_revert,
|
||||||
'# LOC changed': mycommit.num_loc,
|
"# LOC changed": mycommit.num_loc,
|
||||||
'modified projects': (';'.join(mycommit.modified_projects)),
|
"modified projects": (";".join(mycommit.modified_projects)),
|
||||||
'author domain': mycommit.author_domain,
|
"author domain": mycommit.author_domain,
|
||||||
'revision': mycommit.phab_revision if mycommit.phab_revision is not None else "",
|
"revision": mycommit.phab_revision
|
||||||
|
if mycommit.phab_revision is not None
|
||||||
|
else "",
|
||||||
}
|
}
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(e)
|
print(e)
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
max_age = datetime.datetime(year=2019, month=10, day=1,
|
max_age = datetime.datetime(
|
||||||
tzinfo=datetime.timezone.utc)
|
year=2019, month=10, day=1, tzinfo=datetime.timezone.utc
|
||||||
|
)
|
||||||
now = datetime.datetime.now(tz=datetime.timezone.utc)
|
now = datetime.datetime.now(tz=datetime.timezone.utc)
|
||||||
rs = RepoStats(os.path.expanduser('~/git/llvm-project'))
|
rs = RepoStats(os.path.expanduser("~/git/llvm-project"))
|
||||||
# TODO: make the path configurable, and `git clone/pull`
|
# TODO: make the path configurable, and `git clone/pull`
|
||||||
rs.parse_repo(max_age)
|
rs.parse_repo(max_age)
|
||||||
rs.find_reverts()
|
rs.find_reverts()
|
||||||
|
@ -380,4 +458,4 @@ if __name__ == '__main__':
|
||||||
# rs.dump_unreviewed_paths(now - datetime.timedelta(days=100))
|
# rs.dump_unreviewed_paths(now - datetime.timedelta(days=100))
|
||||||
# rs.dump_loc_commits(now - datetime.timedelta(days=100))
|
# rs.dump_loc_commits(now - datetime.timedelta(days=100))
|
||||||
rs.export_commits()
|
rs.export_commits()
|
||||||
print('Done.')
|
print("Done.")
|
||||||
|
|
137
scripts/metrics/repo_hist_db.py
Executable file
137
scripts/metrics/repo_hist_db.py
Executable file
|
@ -0,0 +1,137 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import git
|
||||||
|
from repo_hist import MyCommit
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
# File that holds the SQLite database.
DB_PATH = "tmp/git_hist.sqlite"
# Directory of the local git repository.
REPO_DIR = "tmp/llvm-project"
# Remote the repository is cloned from when REPO_DIR does not exist yet.
GIT_URL = "https://github.com/llvm/llvm-project.git"
# Branch whose history is scanned.
GIT_BRANCH = "main"
# this was the start of using git as primary repo
MAX_AGE = datetime.datetime(2019, 10, 1, tzinfo=datetime.timezone.utc)
# Maximum age of the database before we re-create it
DB_UPDATE_INTERVAL = datetime.timedelta(hours=24)
|
||||||
|
|
||||||
|
|
||||||
|
def popolate_db(
    db_path: str, repo_dir: str, max_age: datetime.datetime
) -> sqlite3.Connection:
    """Return a connection to the commit database, rebuilding it when stale.

    If ``db_path`` exists and its modification time is less than
    DB_UPDATE_INTERVAL ago, the existing file is opened as is. Otherwise any
    old file is removed and a new database is created: tables, indexes, then a
    full scan of the repository.

    (The function keeps its original spelling because callers use this name.)

    Args:
        db_path: path of the SQLite database file.
        repo_dir: path of the local git repository, passed to parse_commits.
        max_age: cut-off passed to parse_commits; older commits are not read.

    Returns:
        An open connection to the database at ``db_path``.
    """
    # TODO: full scan of the git history is quite slow. Maybe enable incremental
    # updates. Only insert commits that are not yet in the database.
    if os.path.exists(db_path):
        age = datetime.datetime.now() - datetime.datetime.fromtimestamp(
            os.path.getmtime(db_path)
        )
        if age < DB_UPDATE_INTERVAL:
            print("Database is recent enough, using existing one.")
            return sqlite3.connect(db_path)
        os.remove(db_path)
    else:
        # sqlite3.connect() creates the database file but not missing parent
        # directories, so make sure the target directory is there.
        parent_dir = os.path.dirname(db_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # NOTE(review): freshness is judged by file mtime only, so a scan that is
    # interrupted half-way leaves a file that is reused until it ages out.
    print("Database is stale, needs updating...")
    conn = sqlite3.connect(db_path)
    print("Creating tables...")
    create_tables(conn)
    print("Creating indexes...")
    create_indexes(conn)
    print("Scanning repository...")
    parse_commits(conn, repo_dir, max_age)
    print("Done populating database.")
    return conn
|
||||||
|
|
||||||
|
|
||||||
|
def create_tables(conn: sqlite3.Connection):
    """Create the ``commits`` and ``commit_project`` tables if they are missing.

    Text columns are declared TEXT: SQLite has no ``string`` type, and a column
    declared that way gets NUMERIC affinity, so a value made only of digits
    (possible for a hash) would be stored as a number.

    Args:
        conn: open database connection; the changes are committed before
            returning.
    """
    # TODO: add more attributes as needed
    conn.execute(
        """ CREATE TABLE IF NOT EXISTS commits (
                hash TEXT PRIMARY KEY,
                commit_time INTEGER,
                phab_id TEXT,
                reverts_hash TEXT
            ); """
    )
    # Normalized representation of modified projects per commit.
    conn.execute(
        """ CREATE TABLE IF NOT EXISTS commit_project (
                project TEXT,
                hash TEXT,
                FOREIGN KEY (hash) REFERENCES commits(hash)
            );"""
    )

    conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def create_indexes(conn: sqlite3.Connection):
    """Indexes to speed up searches and joins.

    Creates one index on ``commit_project.hash`` and one on
    ``commit_project.project``. Like the table creation, the statements use
    ``IF NOT EXISTS`` so calling this on an already indexed database is a
    no-op instead of an error.

    Args:
        conn: open database connection; the changes are committed before
            returning.
    """
    conn.execute(
        """ CREATE INDEX IF NOT EXISTS commit_project_hash
            ON commit_project(hash);"""
    )
    conn.execute(
        """ CREATE INDEX IF NOT EXISTS commit_project_project
            ON commit_project(project);"""
    )
    conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.datetime):
    """Walk GIT_BRANCH of the repository and insert one row per commit.

    The repository at ``repo_dir`` is fetched when the directory exists and
    cloned (bare) from GIT_URL otherwise. Commits are visited in the order
    ``iter_commits`` yields them, and the walk stops at the first commit whose
    ``committed_datetime`` is older than ``max_age``.

    For every commit a row goes into ``commits`` and one row per modified
    project goes into ``commit_project``.

    Args:
        conn: open connection to a database that already has both tables.
        repo_dir: path of the local git repository.
        max_age: cut-off compared against ``commit.committed_datetime``.
    """
    if os.path.isdir(repo_dir):
        print("Fetching git repo...")
        repo = git.Repo(repo_dir)
        # NOTE(review): the clone below is bare; confirm that fetching
        # GIT_BRANCH this way advances the local branch iter_commits() reads.
        repo.remotes.origin.fetch(GIT_BRANCH)
    else:
        print("Cloning git repo...")
        repo = git.Repo.clone_from(GIT_URL, repo_dir, bare=True)
    print("repo update done.")
    sql_insert_commit = """ INSERT INTO
            commits (hash, commit_time, phab_id, reverts_hash)
            values (?,?,?,?);
            """
    sql_insert_commit_project = """ INSERT INTO
            commit_project (hash, project)
            values (?,?);
            """
    day = None
    for commit in repo.iter_commits(GIT_BRANCH):
        # TODO: This takes a couple of minutes, maybe try using multithreading
        if commit.committed_datetime < max_age:
            break
        mycommit = MyCommit(commit)
        if mycommit.date.day != day:
            day = mycommit.date.day
            print(mycommit.date)
            # take a snapshot commit, nice to see progress while updating the
            # database
            conn.commit()
        conn.execute(
            sql_insert_commit,
            (
                mycommit.chash,
                # commit_time is an integer column: store the epoch timestamp
                # itself rather than a datetime object, which sqlite3 would
                # only accept through its deprecated implicit adapter.
                commit.committed_date,
                mycommit.phab_revision,
                mycommit.reverts_commit_hash,
            ),
        )
        # Note: parsing the patches is quite slow
        conn.executemany(
            sql_insert_commit_project,
            ((mycommit.chash, project) for project in mycommit.modified_projects),
        )
    conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def run_queries(conn: sqlite3.Connection, project: str = "libcxx"):
    """Print hash, Phabricator id and commit time of commits touching a project.

    Joins ``commits`` with ``commit_project`` on the commit hash and prints
    every matching row as a tuple.

    Args:
        conn: open connection to the commit database.
        project: project name to filter on; defaults to "libcxx", the value
            that used to be hard-coded in the query.
    """
    # The project is bound as a parameter: a double-quoted "libcxx" is
    # identifier syntax in SQL and only worked as a string through SQLite's
    # compatibility fallback.
    query = """SELECT commits.hash, commits.phab_id, commits.commit_time
        FROM commits
        INNER JOIN commit_project ON commits.hash = commit_project.hash
        WHERE commit_project.project = ?;"""
    cursor = conn.cursor()
    data = cursor.execute(query, (project,))
    for row in data:
        print(row)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Build (or reuse) the database, run the sample query, and always release
    # the connection afterwards.
    conn = popolate_db(DB_PATH, REPO_DIR, MAX_AGE)
    try:
        run_queries(conn)
    finally:
        conn.close()
|
Loading…
Reference in a new issue