diff --git a/scripts/metrics/repo_hist.py b/scripts/metrics/repo_hist.py
old mode 100644
new mode 100755
index 4f8a6ba..2ddb9de
--- a/scripts/metrics/repo_hist.py
+++ b/scripts/metrics/repo_hist.py
@@ -25,15 +25,19 @@
 import random
 import string
 
 REVISION_REGEX = re.compile(
-    r'^Differential Revision: https://reviews\.llvm\.org/(.*)$',
-    re.MULTILINE)
+    r"^Differential Revision: https://reviews\.llvm\.org/(.*)$", re.MULTILINE
+)
 REVERT_REGEX = re.compile(r'^Revert "(.+)"')
+REVERT_HASH_REGEX = re.compile(r"This reverts commit (\w+)", re.MULTILINE)
 
 
 class MyCommit:
-    SALT = ''.join(random.choices(
-        string.ascii_lowercase + string.ascii_uppercase + string.digits, k=16))
+    SALT = "".join(
+        random.choices(
+            string.ascii_lowercase + string.ascii_uppercase + string.digits, k=16
+        )
+    )
 
     def __init__(self, commit: git.Commit):
         self.commit = commit
@@ -43,10 +47,11 @@ class MyCommit:
         self.commiter = hash(commit.committer.email.lower() + MyCommit.SALT)  # type:int
         self.summary = commit.summary  # type: str
         self.date = datetime.datetime.fromtimestamp(
-            commit.committed_date)  # type: datetime.datetime
+            commit.committed_date
+        )  # type: datetime.datetime
         self.phab_revision = self._get_revision(commit)  # type: Optional[str]
-        self.reverts = None # type: Optional[MyCommit]
-        self.reverted_by = None # type: Optional[MyCommit]
+        self.reverts = None  # type: Optional[MyCommit]
+        self.reverted_by = None  # type: Optional[MyCommit]
         self._diff_index = None  # type: Optional[git.DiffIndex]
 
     @staticmethod
@@ -83,20 +88,22 @@ class MyCommit:
 
     @property
     def week(self) -> str:
-        return '{}-w{:02d}'.format(self.date.year, self.date.isocalendar()[1])
+        return "{}-w{:02d}".format(self.date.year, self.date.isocalendar()[1])
 
     @property
     def diff_index(self) -> git.DiffIndex:
         # expensive operation, cache the results
         if self._diff_index is None:
-            self._diff_index = self.commit.diff(self.commit.parents[0], create_patch=True)
+            self._diff_index = self.commit.diff(
+                self.commit.parents[0], create_patch=True
+            )
         return self._diff_index
 
     @property
     def num_loc(self) -> int:
         nloc = 0
         for diff in self.diff_index:
-            nloc += str(diff.diff, encoding='utf8').count('\n')
+            nloc += str(diff.diff, encoding="utf8").count("\n")
         return nloc
 
     @property
@@ -107,11 +114,18 @@ class MyCommit:
 
     @property
     def modified_projects(self) -> Set[str]:
-        return set(p.split('/')[0] for p in self.modified_paths)
+        return set(p.split("/")[0] for p in self.modified_paths)
+
+    @property
+    def reverts_commit_hash(self):
+        m = REVERT_HASH_REGEX.search(self.commit.message)
+        if m is None:
+            # TODO: double check for "Reverts" in summary line for consistency
+            return None
+        return m.group(1)
 
 
 class RepoStats:
-
     def __init__(self, git_dir: str):
         self.repo = git.Repo(git_dir)
         self.commit_by_hash = dict()  # type: Dict[str, MyCommit]
@@ -120,20 +134,19 @@ class RepoStats:
         self.commit_by_author = dict()  # type: Dict[int, List[MyCommit]]
         self.commit_by_author_domain = dict()  # type: Dict[str, List[MyCommit]]
 
-    def parse_repo(self, maxage: datetime.datetime): 
-        for commit in self.repo.iter_commits('main'):
+    def parse_repo(self, maxage: datetime.datetime):
+        for commit in self.repo.iter_commits("main"):
             if commit.committed_datetime < maxage:
                 break
             mycommit = MyCommit(commit)
             self.commit_by_hash[mycommit.chash] = mycommit
-            self.commit_by_summary.setdefault(mycommit.summary, [])\
-                .append(mycommit)
+            self.commit_by_summary.setdefault(mycommit.summary, []).append(mycommit)
             self.commit_by_week.setdefault(mycommit.week, []).append(mycommit)
-            self.commit_by_author.setdefault(mycommit.author, [])\
-                .append(mycommit)
-            self.commit_by_author_domain.setdefault(mycommit.author_domain, []) \
-                .append(mycommit)
-        print('Read {} commits'.format(len(self.commit_by_hash)))
+            self.commit_by_author.setdefault(mycommit.author, []).append(mycommit)
+            self.commit_by_author_domain.setdefault(mycommit.author_domain, []).append(
+                mycommit
+            )
+        print("Read {} commits".format(len(self.commit_by_hash)))
 
     def find_reverts(self):
         reverts = 0
@@ -142,119 +155,171 @@ class RepoStats:
             if summary is None:
                 continue
             if summary not in self.commit_by_summary:
-                print('summary not found: {}'.format(summary))
+                print("summary not found: {}".format(summary))
                 continue
             reverting_commit = self.commit_by_summary[summary][-1]
             commit.reverted_by = reverting_commit
             reverting_commit.reverts = commit
             reverts += 1
-        print('Found {} reverts'.format(reverts))
+        print("Found {} reverts".format(reverts))
 
     # https://stackoverflow.com/questions/2600775/how-to-get-week-number-in-python
     def dump_daily_stats(self):
-        fieldnames = ["week", "num_commits", "num_reverts", "percentage_reverts",
-                      "num_reviewed", "percentage_reviewed",
-                      "# reviewed & revert", "# !reviewed & !revert", "# !reviewed & revert", "# reviewed & !revert"]
-        csvfile = open('tmp/llvm-project-weekly.csv', 'w')
-        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
-                                dialect=csv.excel)
+        fieldnames = [
+            "week",
+            "num_commits",
+            "num_reverts",
+            "percentage_reverts",
+            "num_reviewed",
+            "percentage_reviewed",
+            "# reviewed & revert",
+            "# !reviewed & !revert",
+            "# !reviewed & revert",
+            "# reviewed & !revert",
+        ]
+        csvfile = open("tmp/llvm-project-weekly.csv", "w")
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
         writer.writeheader()
         for week in sorted(self.commit_by_week.keys()):
             commits = self.commit_by_week[week]
             num_commits = len(commits)
             num_reverts = len([c for c in commits if c.is_revert])
-            percentage_reverts = 100.0*num_reverts / num_commits
+            percentage_reverts = 100.0 * num_reverts / num_commits
             num_reviewed = len([c for c in commits if c.was_reviewed])
-            percentage_reviewed = 100*num_reviewed / (num_commits - num_reverts)
-            num_reviewed_revert = len([c for c in commits if c.was_reviewed and c.is_revert])
-            num_reviewed_nrevert = len([c for c in commits if c.was_reviewed and not c.is_revert])
-            num_nreviewed_nrevert = len([c for c in commits if not c.was_reviewed and not c.is_revert])
-            num_nreviewed_revert = len([c for c in commits if not c.was_reviewed and c.is_revert])
-            writer.writerow({
-                "week": week,
-                "num_commits": num_commits,
-                "num_reverts": num_reverts,
-                "percentage_reverts": percentage_reverts,
-                "num_reviewed": num_reviewed,
-                "percentage_reviewed": percentage_reviewed,
-                "# reviewed & revert": num_reviewed_revert,
-                "# !reviewed & !revert": num_nreviewed_nrevert,
-                "# !reviewed & revert": num_nreviewed_revert,
-                "# reviewed & !revert": num_reviewed_nrevert,
-            })
+            percentage_reviewed = 100 * num_reviewed / (num_commits - num_reverts)
+            num_reviewed_revert = len(
+                [c for c in commits if c.was_reviewed and c.is_revert]
+            )
+            num_reviewed_nrevert = len(
+                [c for c in commits if c.was_reviewed and not c.is_revert]
+            )
+            num_nreviewed_nrevert = len(
+                [c for c in commits if not c.was_reviewed and not c.is_revert]
+            )
+            num_nreviewed_revert = len(
+                [c for c in commits if not c.was_reviewed and c.is_revert]
+            )
+            writer.writerow(
+                {
+                    "week": week,
+                    "num_commits": num_commits,
+                    "num_reverts": num_reverts,
+                    "percentage_reverts": percentage_reverts,
+                    "num_reviewed": num_reviewed,
+                    "percentage_reviewed": percentage_reviewed,
+                    "# reviewed & revert": num_reviewed_revert,
+                    "# !reviewed & !revert": num_nreviewed_nrevert,
+                    "# !reviewed & revert": num_nreviewed_revert,
+                    "# reviewed & !revert": num_reviewed_nrevert,
+                }
+            )
 
     def dump_overall_stats(self):
         num_commits = len(self.commit_by_hash)
-        num_reverts = len([c for c in self.commit_by_hash.values()
-                           if c.is_revert])
-        print('Number of commits: {}'.format(num_commits))
-        print('Number of reverts: {}'.format(num_reverts))
-        print('percentage of reverts: {:0.2f}'.format(
-            100*num_reverts / num_commits))
+        num_reverts = len([c for c in self.commit_by_hash.values() if c.is_revert])
+        print("Number of commits: {}".format(num_commits))
+        print("Number of reverts: {}".format(num_reverts))
+        print("percentage of reverts: {:0.2f}".format(100 * num_reverts / num_commits))
 
-        num_reviewed = len([c for c in self.commit_by_hash.values()
-                            if c.was_reviewed])
-        print('Number of reviewed commits: {}'.format(num_reviewed))
-        print('percentage of reviewed commits: {:0.2f}'.format(
-            100*num_reviewed / num_commits))
+        num_reviewed = len([c for c in self.commit_by_hash.values() if c.was_reviewed])
+        print("Number of reviewed commits: {}".format(num_reviewed))
+        print(
+            "percentage of reviewed commits: {:0.2f}".format(
+                100 * num_reviewed / num_commits
+            )
+        )
 
-        num_reviewed_reverted = len([c for c in self.commit_by_hash.values()
-                                     if c.was_reviewed and c.was_reverted])
-        num_not_reviewed_reverted = len([c for c in self.commit_by_hash.values()
-                                         if not c.was_reviewed and
-                                         c.was_reverted])
-        print('Number of reviewed that were reverted: {}'.format(num_reviewed_reverted))
-        print('Number of NOT reviewed that were reverted: {}'.format(num_not_reviewed_reverted))
-        print('percentage of reviewed that were reverted: {:0.2f}'.format(
-            100*num_reviewed_reverted / num_reviewed))
-        print('percentage of NOT reviewed that were reverted: {:0.2f}'.format(
-            100*num_not_reviewed_reverted / (num_commits-num_reviewed)))
+        num_reviewed_reverted = len(
+            [
+                c
+                for c in self.commit_by_hash.values()
+                if c.was_reviewed and c.was_reverted
+            ]
+        )
+        num_not_reviewed_reverted = len(
+            [
+                c
+                for c in self.commit_by_hash.values()
+                if not c.was_reviewed and c.was_reverted
+            ]
+        )
+        print("Number of reviewed that were reverted: {}".format(num_reviewed_reverted))
+        print(
+            "Number of NOT reviewed that were reverted: {}".format(
+                num_not_reviewed_reverted
+            )
+        )
+        print(
+            "percentage of reviewed that were reverted: {:0.2f}".format(
+                100 * num_reviewed_reverted / num_reviewed
+            )
+        )
+        print(
+            "percentage of NOT reviewed that were reverted: {:0.2f}".format(
+                100 * num_not_reviewed_reverted / (num_commits - num_reviewed)
+            )
+        )
 
-        num_foreign_committer = len([c for c in self.commit_by_hash.values()
-                                     if c.author != c.commiter])
-        print('Number of commits where author != committer: {}'.format(
-            num_foreign_committer))
-        print('Percentage of commits where author != committer: {:0.2f}'.format(
-            100*num_foreign_committer/num_commits))
+        num_foreign_committer = len(
+            [c for c in self.commit_by_hash.values() if c.author != c.commiter]
+        )
+        print(
+            "Number of commits where author != committer: {}".format(
+                num_foreign_committer
+            )
+        )
+        print(
+            "Percentage of commits where author != committer: {:0.2f}".format(
+                100 * num_foreign_committer / num_commits
+            )
+        )
 
     def dump_author_stats(self):
-        print('Number of authors: {}'.format(len(self.commit_by_author)))
-        fieldnames = ["author", "num_commits", "num_reverts", "percentage_reverts",
-                      "num_reviewed", "percentage_reviewed"]
-        csvfile = open('tmp/llvm-project-authors.csv', 'w')
-        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
-                                dialect=csv.excel)
+        print("Number of authors: {}".format(len(self.commit_by_author)))
+        fieldnames = [
+            "author",
+            "num_commits",
+            "num_reverts",
+            "percentage_reverts",
+            "num_reviewed",
+            "percentage_reviewed",
+        ]
+        csvfile = open("tmp/llvm-project-authors.csv", "w")
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
         writer.writeheader()
         for author, commits in self.commit_by_author.items():
            num_commits = len(commits)
            num_reverts = len([c for c in commits if c.was_reverted])
            percentage_reverts = 100 * num_reverts / num_commits
            num_reviewed = len([c for c in commits if c.was_reviewed])
-            percentage_reviewed = 100*num_reviewed / num_commits
-            writer.writerow({
-                "author": author,
-                "num_commits": num_commits,
-                "num_reverts": num_reverts,
-                "percentage_reverts": percentage_reverts,
-                "num_reviewed": num_reviewed,
-                "percentage_reviewed": percentage_reviewed,
-            })
+            percentage_reviewed = 100 * num_reviewed / num_commits
+            writer.writerow(
+                {
+                    "author": author,
+                    "num_commits": num_commits,
+                    "num_reverts": num_reverts,
+                    "percentage_reverts": percentage_reverts,
+                    "num_reviewed": num_reviewed,
+                    "percentage_reviewed": percentage_reviewed,
+                }
+            )
 
     def dump_author_domain_stats(self):
-        print('Number of authors: {}'.format(len(self.commit_by_author)))
+        print("Number of authors: {}".format(len(self.commit_by_author)))
         fieldnames = ["author_domain", "num_commits", "num_committers"]
-        csvfile = open('tmp/llvm-project-author_domains.csv', 'w')
-        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
-                                dialect=csv.excel)
+        csvfile = open("tmp/llvm-project-author_domains.csv", "w")
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
         writer.writeheader()
         for author_domain, commits in self.commit_by_author_domain.items():
             num_commits = len(commits)
             committers = set(c.author for c in commits)
-            writer.writerow({
-                "author_domain": author_domain,
-                "num_commits": num_commits,
-                "num_committers": len(committers),
-            })
+            writer.writerow(
+                {
+                    "author_domain": author_domain,
+                    "num_commits": num_commits,
+                    "num_committers": len(committers),
+                }
            )
 
     def dump_unreviewed_paths(self, maxage: datetime.datetime):
         # TODO: this is really slow. Maybe parallelize?
@@ -262,23 +327,22 @@ class RepoStats:
             True: {},
             False: {},
         }  # type: Dict[bool, Dict[str, int]]
-        for commit in self.repo.iter_commits('main'):
+        for commit in self.repo.iter_commits("main"):
             if commit.committed_datetime < maxage:
                 break
             mycommit = MyCommit(commit)
-            for prefix in set(p.split('/')[0] for p in mycommit.modified_paths):
+            for prefix in set(p.split("/")[0] for p in mycommit.modified_paths):
                 path_count[mycommit.was_reviewed].setdefault(prefix, 0)
                 path_count[mycommit.was_reviewed][prefix] += 1
-        fieldnames = ['was_reviewed']
+        fieldnames = ["was_reviewed"]
         all_paths = set(path_count[True].keys())
         all_paths.update(path_count[False].keys())
         fieldnames.extend(sorted(all_paths))
-        csvfile = open('tmp/llvm-project-unreviewed-paths.csv', 'w')
-        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
-                                dialect=csv.excel)
+        csvfile = open("tmp/llvm-project-unreviewed-paths.csv", "w")
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
         writer.writeheader()
         for reviewed in [True, False]:
-            row = {'was_reviewed': reviewed}
+            row = {"was_reviewed": reviewed}
             for path, count in path_count[reviewed].items():
                 row[path] = count
             writer.writerow(row)
@@ -295,31 +359,34 @@ class RepoStats:
             True: {b: 0 for b in buckets},
             False: {b: 0 for b in buckets},
         }  # type: Dict[bool, Dict[int, int]]
-        for commit in self.repo.iter_commits('main'):
+        for commit in self.repo.iter_commits("main"):
             if commit.committed_datetime < maxage:
                 break
             mycommit = self.commit_by_hash[commit.hexsha]
-            review_dict[mycommit.was_reviewed][self._find_bucket(mycommit.num_loc, buckets)] += 1
-            reverted_dict[mycommit.was_reverted][self._find_bucket(mycommit.num_loc, buckets)] += 1
-        fieldnames = ['was_reviewed']
-        for i in range(0, len(buckets)-1):
-            fieldnames.append('{}-{}'.format(buckets[i], buckets[i+1]-1))
-        fieldnames.append('>={}'.format(buckets[-1]))
-        csvfile = open('tmp/llvm-project-unreviewed-loc.csv', 'w')
-        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
-                                dialect=csv.excel)
+            review_dict[mycommit.was_reviewed][
+                self._find_bucket(mycommit.num_loc, buckets)
+            ] += 1
+            reverted_dict[mycommit.was_reverted][
+                self._find_bucket(mycommit.num_loc, buckets)
+            ] += 1
+        fieldnames = ["was_reviewed"]
+        for i in range(0, len(buckets) - 1):
+            fieldnames.append("{}-{}".format(buckets[i], buckets[i + 1] - 1))
+        fieldnames.append(">={}".format(buckets[-1]))
+        csvfile = open("tmp/llvm-project-unreviewed-loc.csv", "w")
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
         writer.writeheader()
         for reviewed in [True, False]:
-            row = {'was_reviewed': reviewed}
+            row = {"was_reviewed": reviewed}
             for i in range(0, len(buckets)):
-                row[fieldnames[i+1]] = review_dict[reviewed][buckets[i]]
+                row[fieldnames[i + 1]] = review_dict[reviewed][buckets[i]]
             writer.writerow(row)
-        writer.writerow({'was_reviewed': 'reverted'})
+        writer.writerow({"was_reviewed": "reverted"})
         for reverted in [True, False]:
-            row = {'was_reviewed': reverted}
+            row = {"was_reviewed": reverted}
             for i in range(0, len(buckets)):
-                row[fieldnames[i+1]] = reverted_dict[reverted][buckets[i]]
+                row[fieldnames[i + 1]] = reverted_dict[reverted][buckets[i]]
             writer.writerow(row)
         csvfile.close()
 
 
@@ -332,12 +399,20 @@ class RepoStats:
 
 
     def export_commits(self):
-        print('starting export...')
-        csvfile = open('tmp/llvm-project-export.csv', 'w')
-        fieldnames = ['timestamp', 'hash', 'reviewed', 'was_reverted', 'is_revert', '# LOC changed',
-                      'modified projects', 'author domain', 'revision']
-        writer = csv.DictWriter(csvfile, fieldnames=fieldnames,
-                                dialect=csv.excel)
+        print("starting export...")
+        csvfile = open("tmp/llvm-project-export.csv", "w")
+        fieldnames = [
+            "timestamp",
+            "hash",
+            "reviewed",
+            "was_reverted",
+            "is_revert",
+            "# LOC changed",
+            "modified projects",
+            "author domain",
+            "revision",
+        ]
+        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
         writer.writeheader()
         # did not work with multiprocessing.map, gave recursion error from gitpython...
         # so using normal map function
@@ -349,26 +424,29 @@ class RepoStats:
 def _create_row(mycommit: MyCommit) -> Dict:
     try:
         return {
-            'timestamp': mycommit.date.isoformat(),
-            'hash': mycommit.chash,
-            'reviewed': mycommit.was_reviewed,
-            'was_reverted': mycommit.was_reverted,
-            'is_revert': mycommit.is_revert,
-            '# LOC changed': mycommit.num_loc,
-            'modified projects': (';'.join(mycommit.modified_projects)),
-            'author domain': mycommit.author_domain,
-            'revision': mycommit.phab_revision if mycommit.phab_revision is not None else "",
+            "timestamp": mycommit.date.isoformat(),
+            "hash": mycommit.chash,
+            "reviewed": mycommit.was_reviewed,
+            "was_reverted": mycommit.was_reverted,
+            "is_revert": mycommit.is_revert,
+            "# LOC changed": mycommit.num_loc,
+            "modified projects": (";".join(mycommit.modified_projects)),
+            "author domain": mycommit.author_domain,
+            "revision": mycommit.phab_revision
+            if mycommit.phab_revision is not None
+            else "",
         }
     except Exception as e:
         print(e)
         return {}
 
 
-if __name__ == '__main__':
-    max_age = datetime.datetime(year=2019, month=10, day=1,
-                                tzinfo=datetime.timezone.utc)
+if __name__ == "__main__":
+    max_age = datetime.datetime(
+        year=2019, month=10, day=1, tzinfo=datetime.timezone.utc
+    )
     now = datetime.datetime.now(tz=datetime.timezone.utc)
-    rs = RepoStats(os.path.expanduser('~/git/llvm-project'))
+    rs = RepoStats(os.path.expanduser("~/git/llvm-project"))
     # TODO: make the path configurable, and `git clone/pull`
     rs.parse_repo(max_age)
     rs.find_reverts()
@@ -380,4 +458,4 @@ if __name__ == '__main__':
     # rs.dump_unreviewed_paths(now - datetime.timedelta(days=100))
     # rs.dump_loc_commits(now - datetime.timedelta(days=100))
     rs.export_commits()
-    print('Done.')
+    print("Done.")
diff --git a/scripts/metrics/repo_hist_db.py b/scripts/metrics/repo_hist_db.py
new file mode 100755
index 0000000..4346978
--- /dev/null
+++ b/scripts/metrics/repo_hist_db.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+import os
+import sqlite3
+import git
+from repo_hist import MyCommit
+import datetime
+
+DB_PATH = "tmp/git_hist.sqlite"
+REPO_DIR = "tmp/llvm-project"
+GIT_URL = "https://github.com/llvm/llvm-project.git"
+GIT_BRANCH = "main"
+# this was the start of using git as primary repo
+MAX_AGE = datetime.datetime(year=2019, month=10, day=1, tzinfo=datetime.timezone.utc)
+# Maximum age of the database before we re-create it
+DB_UPDATE_INTERVAL = datetime.timedelta(days=1)
+
+
+def populate_db(
+    db_path: str, repo_dir: str, max_age: datetime.datetime
+) -> sqlite3.Connection:
+    # TODO: full scan of the git history is quite slow. Maybe enable incremental
+    # updates. Only insert commits that are not yet in the database.
+    if os.path.exists(db_path):
+        age = datetime.datetime.now() - datetime.datetime.fromtimestamp(
+            os.path.getmtime(db_path)
+        )
+        if age < DB_UPDATE_INTERVAL:
+            print("Database is recent enough, using existing one.")
+            return sqlite3.connect(db_path)
+        os.remove(db_path)
+
+    print("Database is stale, needs updating...")
+    conn = sqlite3.connect(db_path)
+    print("Creating tables...")
+    create_tables(conn)
+    print("Creating indexes...")
+    create_indexes(conn)
+    print("Scanning repository...")
+    parse_commits(conn, repo_dir, max_age)
+    print("Done populating database.")
+    return conn
+
+
+def create_tables(conn: sqlite3.Connection):
+    # TODO: add more attributes as needed
+    conn.execute(
+        """ CREATE TABLE IF NOT EXISTS commits (
+                hash string PRIMARY KEY,
+                commit_time integer,
+                phab_id string,
+                reverts_hash string
+            ); """
+    )
+    # Normalized representation of modified projects per commit.
+    conn.execute(
+        """ CREATE TABLE IF NOT EXISTS commit_project (
+                project string,
+                hash string,
+                FOREIGN KEY (hash) REFERENCES commits(hash)
+            );"""
+    )
+
+    conn.commit()
+
+
+def create_indexes(conn: sqlite3.Connection):
+    """Indexes to speed up searches and joins."""
+    conn.execute(
+        """ CREATE INDEX commit_project_hash
+                ON commit_project(hash);"""
+    )
+    conn.execute(
+        """ CREATE INDEX commit_project_project
+                ON commit_project(project);"""
+    )
+    conn.commit()
+
+
+def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.datetime):
+    if os.path.isdir(repo_dir):
+        print("Fetching git repo...")
+        repo = git.Repo(repo_dir)
+        repo.remotes.origin.fetch(GIT_BRANCH)
+    else:
+        print("Cloning git repo...")
+        git.Repo.clone_from(GIT_URL, repo_dir, bare=True)
+        repo = git.Repo(repo_dir)
+    print("repo update done.")
+    sql_insert_commit = """ INSERT INTO
+          commits (hash, commit_time, phab_id, reverts_hash)
+          values (?,?,?,?);
+        """
+    sql_insert_commit_project = """ INSERT INTO
+          commit_project (hash, project)
+          values (?,?);
+        """
+    day = None
+    for commit in repo.iter_commits(GIT_BRANCH):
+        # TODO: This takes a couple of minutes, maybe try using multithreading
+        if commit.committed_datetime < max_age:
+            break
+        mycommit = MyCommit(commit)
+        if mycommit.date.day != day:
+            day = mycommit.date.day
+            print(mycommit.date)
+            # take a snapshot commit, nice to see progress while updating the
+            # database
+            conn.commit()
+        conn.execute(
+            sql_insert_commit,
+            (
+                mycommit.chash,
+                mycommit.date,
+                mycommit.phab_revision,
+                mycommit.reverts_commit_hash,
+            ),
+        )
+        # Note: parsing the patches is quite slow
+        for project in mycommit.modified_projects:
+            conn.execute(sql_insert_commit_project, (mycommit.chash, project))
+    conn.commit()
+
+
+def run_queries(conn: sqlite3.Connection):
+    query = """SELECT commits.hash, commits.phab_id, commits.commit_time
+               FROM commits
+               INNER JOIN commit_project ON commits.hash = commit_project.hash
+               WHERE commit_project.project="libcxx";"""
+    cursor = conn.cursor()
+    data = cursor.execute(query)
+    for row in data:
+        print(row)
+
+
+if __name__ == "__main__":
+    conn = populate_db(DB_PATH, REPO_DIR, MAX_AGE)
+    run_queries(conn)