1
0
Fork 0

first query on DB working

This commit is contained in:
Christian Kühnel 2021-04-27 09:56:24 +02:00 committed by Mikhail Goncharov
parent 08bb6492fa
commit 06c97f1dc5
2 changed files with 356 additions and 141 deletions

360
scripts/metrics/repo_hist.py Normal file → Executable file
View file

@ -25,15 +25,19 @@ import random
import string import string
REVISION_REGEX = re.compile( REVISION_REGEX = re.compile(
r'^Differential Revision: https://reviews\.llvm\.org/(.*)$', r"^Differential Revision: https://reviews\.llvm\.org/(.*)$", re.MULTILINE
re.MULTILINE) )
REVERT_REGEX = re.compile(r'^Revert "(.+)"') REVERT_REGEX = re.compile(r'^Revert "(.+)"')
# Captures the word (the commit hash) that follows "This reverts commit " anywhere
# in a commit message. Raw string: "\w" in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
REVERT_HASH_REGEX = re.compile(r"This reverts commit (\w+)", re.MULTILINE)
class MyCommit: class MyCommit:
SALT = ''.join(random.choices( SALT = "".join(
string.ascii_lowercase + string.ascii_uppercase + string.digits, k=16)) random.choices(
string.ascii_lowercase + string.ascii_uppercase + string.digits, k=16
)
)
def __init__(self, commit: git.Commit): def __init__(self, commit: git.Commit):
self.commit = commit self.commit = commit
@ -43,10 +47,11 @@ class MyCommit:
self.commiter = hash(commit.committer.email.lower() + MyCommit.SALT) # type:int self.commiter = hash(commit.committer.email.lower() + MyCommit.SALT) # type:int
self.summary = commit.summary # type: str self.summary = commit.summary # type: str
self.date = datetime.datetime.fromtimestamp( self.date = datetime.datetime.fromtimestamp(
commit.committed_date) # type: datetime.datetime commit.committed_date
) # type: datetime.datetime
self.phab_revision = self._get_revision(commit) # type: Optional[str] self.phab_revision = self._get_revision(commit) # type: Optional[str]
self.reverts = None # type: Optional[MyCommit] self.reverts = None # type: Optional[MyCommit]
self.reverted_by = None # type: Optional[MyCommit] self.reverted_by = None # type: Optional[MyCommit]
self._diff_index = None # type: Optional[git.DiffIndex] self._diff_index = None # type: Optional[git.DiffIndex]
@staticmethod @staticmethod
@ -83,20 +88,22 @@ class MyCommit:
@property @property
def week(self) -> str: def week(self) -> str:
return '{}-w{:02d}'.format(self.date.year, self.date.isocalendar()[1]) return "{}-w{:02d}".format(self.date.year, self.date.isocalendar()[1])
@property @property
def diff_index(self) -> git.DiffIndex: def diff_index(self) -> git.DiffIndex:
# expensive operation, cache the results # expensive operation, cache the results
if self._diff_index is None: if self._diff_index is None:
self._diff_index = self.commit.diff(self.commit.parents[0], create_patch=True) self._diff_index = self.commit.diff(
self.commit.parents[0], create_patch=True
)
return self._diff_index return self._diff_index
@property @property
def num_loc(self) -> int: def num_loc(self) -> int:
nloc = 0 nloc = 0
for diff in self.diff_index: for diff in self.diff_index:
nloc += str(diff.diff, encoding='utf8').count('\n') nloc += str(diff.diff, encoding="utf8").count("\n")
return nloc return nloc
@property @property
@ -107,11 +114,18 @@ class MyCommit:
@property @property
def modified_projects(self) -> Set[str]: def modified_projects(self) -> Set[str]:
return set(p.split('/')[0] for p in self.modified_paths) return set(p.split("/")[0] for p in self.modified_paths)
@property
def reverts_commit_hash(self):
    """Return the hash named by "This reverts commit <hash>" in the message.

    Searches the full commit message with REVERT_HASH_REGEX and returns the
    captured group, or None when the message contains no such phrase.
    """
    # TODO: double check for "Reverts" in summary line for consistency
    match = REVERT_HASH_REGEX.search(self.commit.message)
    return match.group(1) if match is not None else None
class RepoStats: class RepoStats:
def __init__(self, git_dir: str): def __init__(self, git_dir: str):
self.repo = git.Repo(git_dir) self.repo = git.Repo(git_dir)
self.commit_by_hash = dict() # type: Dict[str, MyCommit] self.commit_by_hash = dict() # type: Dict[str, MyCommit]
@ -120,20 +134,19 @@ class RepoStats:
self.commit_by_author = dict() # type: Dict[int, List[MyCommit]] self.commit_by_author = dict() # type: Dict[int, List[MyCommit]]
self.commit_by_author_domain = dict() # type: Dict[str, List[MyCommit]] self.commit_by_author_domain = dict() # type: Dict[str, List[MyCommit]]
def parse_repo(self, maxage: datetime.datetime): def parse_repo(self, maxage: datetime.datetime):
for commit in self.repo.iter_commits('main'): for commit in self.repo.iter_commits("main"):
if commit.committed_datetime < maxage: if commit.committed_datetime < maxage:
break break
mycommit = MyCommit(commit) mycommit = MyCommit(commit)
self.commit_by_hash[mycommit.chash] = mycommit self.commit_by_hash[mycommit.chash] = mycommit
self.commit_by_summary.setdefault(mycommit.summary, [])\ self.commit_by_summary.setdefault(mycommit.summary, []).append(mycommit)
.append(mycommit)
self.commit_by_week.setdefault(mycommit.week, []).append(mycommit) self.commit_by_week.setdefault(mycommit.week, []).append(mycommit)
self.commit_by_author.setdefault(mycommit.author, [])\ self.commit_by_author.setdefault(mycommit.author, []).append(mycommit)
.append(mycommit) self.commit_by_author_domain.setdefault(mycommit.author_domain, []).append(
self.commit_by_author_domain.setdefault(mycommit.author_domain, []) \ mycommit
.append(mycommit) )
print('Read {} commits'.format(len(self.commit_by_hash))) print("Read {} commits".format(len(self.commit_by_hash)))
def find_reverts(self): def find_reverts(self):
reverts = 0 reverts = 0
@ -142,119 +155,171 @@ class RepoStats:
if summary is None: if summary is None:
continue continue
if summary not in self.commit_by_summary: if summary not in self.commit_by_summary:
print('summary not found: {}'.format(summary)) print("summary not found: {}".format(summary))
continue continue
reverting_commit = self.commit_by_summary[summary][-1] reverting_commit = self.commit_by_summary[summary][-1]
commit.reverted_by = reverting_commit commit.reverted_by = reverting_commit
reverting_commit.reverts = commit reverting_commit.reverts = commit
reverts += 1 reverts += 1
print('Found {} reverts'.format(reverts)) print("Found {} reverts".format(reverts))
# https://stackoverflow.com/questions/2600775/how-to-get-week-number-in-python # https://stackoverflow.com/questions/2600775/how-to-get-week-number-in-python
def dump_daily_stats(self): def dump_daily_stats(self):
fieldnames = ["week", "num_commits", "num_reverts", "percentage_reverts", fieldnames = [
"num_reviewed", "percentage_reviewed", "week",
"# reviewed & revert", "# !reviewed & !revert", "# !reviewed & revert", "# reviewed & !revert"] "num_commits",
csvfile = open('tmp/llvm-project-weekly.csv', 'w') "num_reverts",
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, "percentage_reverts",
dialect=csv.excel) "num_reviewed",
"percentage_reviewed",
"# reviewed & revert",
"# !reviewed & !revert",
"# !reviewed & revert",
"# reviewed & !revert",
]
csvfile = open("tmp/llvm-project-weekly.csv", "w")
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
writer.writeheader() writer.writeheader()
for week in sorted(self.commit_by_week.keys()): for week in sorted(self.commit_by_week.keys()):
commits = self.commit_by_week[week] commits = self.commit_by_week[week]
num_commits = len(commits) num_commits = len(commits)
num_reverts = len([c for c in commits if c.is_revert]) num_reverts = len([c for c in commits if c.is_revert])
percentage_reverts = 100.0*num_reverts / num_commits percentage_reverts = 100.0 * num_reverts / num_commits
num_reviewed = len([c for c in commits if c.was_reviewed]) num_reviewed = len([c for c in commits if c.was_reviewed])
percentage_reviewed = 100*num_reviewed / (num_commits - num_reverts) percentage_reviewed = 100 * num_reviewed / (num_commits - num_reverts)
num_reviewed_revert = len([c for c in commits if c.was_reviewed and c.is_revert]) num_reviewed_revert = len(
num_reviewed_nrevert = len([c for c in commits if c.was_reviewed and not c.is_revert]) [c for c in commits if c.was_reviewed and c.is_revert]
num_nreviewed_nrevert = len([c for c in commits if not c.was_reviewed and not c.is_revert]) )
num_nreviewed_revert = len([c for c in commits if not c.was_reviewed and c.is_revert]) num_reviewed_nrevert = len(
writer.writerow({ [c for c in commits if c.was_reviewed and not c.is_revert]
"week": week, )
"num_commits": num_commits, num_nreviewed_nrevert = len(
"num_reverts": num_reverts, [c for c in commits if not c.was_reviewed and not c.is_revert]
"percentage_reverts": percentage_reverts, )
"num_reviewed": num_reviewed, num_nreviewed_revert = len(
"percentage_reviewed": percentage_reviewed, [c for c in commits if not c.was_reviewed and c.is_revert]
"# reviewed & revert": num_reviewed_revert, )
"# !reviewed & !revert": num_nreviewed_nrevert, writer.writerow(
"# !reviewed & revert": num_nreviewed_revert, {
"# reviewed & !revert": num_reviewed_nrevert, "week": week,
}) "num_commits": num_commits,
"num_reverts": num_reverts,
"percentage_reverts": percentage_reverts,
"num_reviewed": num_reviewed,
"percentage_reviewed": percentage_reviewed,
"# reviewed & revert": num_reviewed_revert,
"# !reviewed & !revert": num_nreviewed_nrevert,
"# !reviewed & revert": num_nreviewed_revert,
"# reviewed & !revert": num_reviewed_nrevert,
}
)
def dump_overall_stats(self): def dump_overall_stats(self):
num_commits = len(self.commit_by_hash) num_commits = len(self.commit_by_hash)
num_reverts = len([c for c in self.commit_by_hash.values() num_reverts = len([c for c in self.commit_by_hash.values() if c.is_revert])
if c.is_revert]) print("Number of commits: {}".format(num_commits))
print('Number of commits: {}'.format(num_commits)) print("Number of reverts: {}".format(num_reverts))
print('Number of reverts: {}'.format(num_reverts)) print("percentage of reverts: {:0.2f}".format(100 * num_reverts / num_commits))
print('percentage of reverts: {:0.2f}'.format(
100*num_reverts / num_commits))
num_reviewed = len([c for c in self.commit_by_hash.values() num_reviewed = len([c for c in self.commit_by_hash.values() if c.was_reviewed])
if c.was_reviewed]) print("Number of reviewed commits: {}".format(num_reviewed))
print('Number of reviewed commits: {}'.format(num_reviewed)) print(
print('percentage of reviewed commits: {:0.2f}'.format( "percentage of reviewed commits: {:0.2f}".format(
100*num_reviewed / num_commits)) 100 * num_reviewed / num_commits
)
)
num_reviewed_reverted = len([c for c in self.commit_by_hash.values() num_reviewed_reverted = len(
if c.was_reviewed and c.was_reverted]) [
num_not_reviewed_reverted = len([c for c in self.commit_by_hash.values() c
if not c.was_reviewed and for c in self.commit_by_hash.values()
c.was_reverted]) if c.was_reviewed and c.was_reverted
print('Number of reviewed that were reverted: {}'.format(num_reviewed_reverted)) ]
print('Number of NOT reviewed that were reverted: {}'.format(num_not_reviewed_reverted)) )
print('percentage of reviewed that were reverted: {:0.2f}'.format( num_not_reviewed_reverted = len(
100*num_reviewed_reverted / num_reviewed)) [
print('percentage of NOT reviewed that were reverted: {:0.2f}'.format( c
100*num_not_reviewed_reverted / (num_commits-num_reviewed))) for c in self.commit_by_hash.values()
if not c.was_reviewed and c.was_reverted
]
)
print("Number of reviewed that were reverted: {}".format(num_reviewed_reverted))
print(
"Number of NOT reviewed that were reverted: {}".format(
num_not_reviewed_reverted
)
)
print(
"percentage of reviewed that were reverted: {:0.2f}".format(
100 * num_reviewed_reverted / num_reviewed
)
)
print(
"percentage of NOT reviewed that were reverted: {:0.2f}".format(
100 * num_not_reviewed_reverted / (num_commits - num_reviewed)
)
)
num_foreign_committer = len([c for c in self.commit_by_hash.values() num_foreign_committer = len(
if c.author != c.commiter]) [c for c in self.commit_by_hash.values() if c.author != c.commiter]
print('Number of commits where author != committer: {}'.format( )
num_foreign_committer)) print(
print('Percentage of commits where author != committer: {:0.2f}'.format( "Number of commits where author != committer: {}".format(
100*num_foreign_committer/num_commits)) num_foreign_committer
)
)
print(
"Percentage of commits where author != committer: {:0.2f}".format(
100 * num_foreign_committer / num_commits
)
)
def dump_author_stats(self): def dump_author_stats(self):
print('Number of authors: {}'.format(len(self.commit_by_author))) print("Number of authors: {}".format(len(self.commit_by_author)))
fieldnames = ["author", "num_commits", "num_reverts", "percentage_reverts", fieldnames = [
"num_reviewed", "percentage_reviewed"] "author",
csvfile = open('tmp/llvm-project-authors.csv', 'w') "num_commits",
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, "num_reverts",
dialect=csv.excel) "percentage_reverts",
"num_reviewed",
"percentage_reviewed",
]
csvfile = open("tmp/llvm-project-authors.csv", "w")
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
writer.writeheader() writer.writeheader()
for author, commits in self.commit_by_author.items(): for author, commits in self.commit_by_author.items():
num_commits = len(commits) num_commits = len(commits)
num_reverts = len([c for c in commits if c.was_reverted]) num_reverts = len([c for c in commits if c.was_reverted])
percentage_reverts = 100 * num_reverts / num_commits percentage_reverts = 100 * num_reverts / num_commits
num_reviewed = len([c for c in commits if c.was_reviewed]) num_reviewed = len([c for c in commits if c.was_reviewed])
percentage_reviewed = 100*num_reviewed / num_commits percentage_reviewed = 100 * num_reviewed / num_commits
writer.writerow({ writer.writerow(
"author": author, {
"num_commits": num_commits, "author": author,
"num_reverts": num_reverts, "num_commits": num_commits,
"percentage_reverts": percentage_reverts, "num_reverts": num_reverts,
"num_reviewed": num_reviewed, "percentage_reverts": percentage_reverts,
"percentage_reviewed": percentage_reviewed, "num_reviewed": num_reviewed,
}) "percentage_reviewed": percentage_reviewed,
}
)
def dump_author_domain_stats(self): def dump_author_domain_stats(self):
print('Number of authors: {}'.format(len(self.commit_by_author))) print("Number of authors: {}".format(len(self.commit_by_author)))
fieldnames = ["author_domain", "num_commits", "num_committers"] fieldnames = ["author_domain", "num_commits", "num_committers"]
csvfile = open('tmp/llvm-project-author_domains.csv', 'w') csvfile = open("tmp/llvm-project-author_domains.csv", "w")
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
dialect=csv.excel)
writer.writeheader() writer.writeheader()
for author_domain, commits in self.commit_by_author_domain.items(): for author_domain, commits in self.commit_by_author_domain.items():
num_commits = len(commits) num_commits = len(commits)
committers = set(c.author for c in commits) committers = set(c.author for c in commits)
writer.writerow({ writer.writerow(
"author_domain": author_domain, {
"num_commits": num_commits, "author_domain": author_domain,
"num_committers": len(committers), "num_commits": num_commits,
}) "num_committers": len(committers),
}
)
def dump_unreviewed_paths(self, maxage: datetime.datetime): def dump_unreviewed_paths(self, maxage: datetime.datetime):
# TODO: this is really slow. Maybe parallelize? # TODO: this is really slow. Maybe parallelize?
@ -262,23 +327,22 @@ class RepoStats:
True: {}, True: {},
False: {}, False: {},
} # type: Dict[bool, Dict[str, int]] } # type: Dict[bool, Dict[str, int]]
for commit in self.repo.iter_commits('main'): for commit in self.repo.iter_commits("main"):
if commit.committed_datetime < maxage: if commit.committed_datetime < maxage:
break break
mycommit = MyCommit(commit) mycommit = MyCommit(commit)
for prefix in set(p.split('/')[0] for p in mycommit.modified_paths): for prefix in set(p.split("/")[0] for p in mycommit.modified_paths):
path_count[mycommit.was_reviewed].setdefault(prefix, 0) path_count[mycommit.was_reviewed].setdefault(prefix, 0)
path_count[mycommit.was_reviewed][prefix] += 1 path_count[mycommit.was_reviewed][prefix] += 1
fieldnames = ['was_reviewed'] fieldnames = ["was_reviewed"]
all_paths = set(path_count[True].keys()) all_paths = set(path_count[True].keys())
all_paths.update(path_count[False].keys()) all_paths.update(path_count[False].keys())
fieldnames.extend(sorted(all_paths)) fieldnames.extend(sorted(all_paths))
csvfile = open('tmp/llvm-project-unreviewed-paths.csv', 'w') csvfile = open("tmp/llvm-project-unreviewed-paths.csv", "w")
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
dialect=csv.excel)
writer.writeheader() writer.writeheader()
for reviewed in [True, False]: for reviewed in [True, False]:
row = {'was_reviewed': reviewed} row = {"was_reviewed": reviewed}
for path, count in path_count[reviewed].items(): for path, count in path_count[reviewed].items():
row[path] = count row[path] = count
writer.writerow(row) writer.writerow(row)
@ -295,31 +359,34 @@ class RepoStats:
True: {b: 0 for b in buckets}, True: {b: 0 for b in buckets},
False: {b: 0 for b in buckets}, False: {b: 0 for b in buckets},
} # type: Dict[bool, Dict[int, int]] } # type: Dict[bool, Dict[int, int]]
for commit in self.repo.iter_commits('main'): for commit in self.repo.iter_commits("main"):
if commit.committed_datetime < maxage: if commit.committed_datetime < maxage:
break break
mycommit = self.commit_by_hash[commit.hexsha] mycommit = self.commit_by_hash[commit.hexsha]
review_dict[mycommit.was_reviewed][self._find_bucket(mycommit.num_loc, buckets)] += 1 review_dict[mycommit.was_reviewed][
reverted_dict[mycommit.was_reverted][self._find_bucket(mycommit.num_loc, buckets)] += 1 self._find_bucket(mycommit.num_loc, buckets)
fieldnames = ['was_reviewed'] ] += 1
for i in range(0, len(buckets)-1): reverted_dict[mycommit.was_reverted][
fieldnames.append('{}-{}'.format(buckets[i], buckets[i+1]-1)) self._find_bucket(mycommit.num_loc, buckets)
fieldnames.append('>={}'.format(buckets[-1])) ] += 1
csvfile = open('tmp/llvm-project-unreviewed-loc.csv', 'w') fieldnames = ["was_reviewed"]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, for i in range(0, len(buckets) - 1):
dialect=csv.excel) fieldnames.append("{}-{}".format(buckets[i], buckets[i + 1] - 1))
fieldnames.append(">={}".format(buckets[-1]))
csvfile = open("tmp/llvm-project-unreviewed-loc.csv", "w")
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
writer.writeheader() writer.writeheader()
for reviewed in [True, False]: for reviewed in [True, False]:
row = {'was_reviewed': reviewed} row = {"was_reviewed": reviewed}
for i in range(0, len(buckets)): for i in range(0, len(buckets)):
row[fieldnames[i+1]] = review_dict[reviewed][buckets[i]] row[fieldnames[i + 1]] = review_dict[reviewed][buckets[i]]
writer.writerow(row) writer.writerow(row)
writer.writerow({'was_reviewed': 'reverted'}) writer.writerow({"was_reviewed": "reverted"})
for reverted in [True, False]: for reverted in [True, False]:
row = {'was_reviewed': reverted} row = {"was_reviewed": reverted}
for i in range(0, len(buckets)): for i in range(0, len(buckets)):
row[fieldnames[i+1]] = reverted_dict[reverted][buckets[i]] row[fieldnames[i + 1]] = reverted_dict[reverted][buckets[i]]
writer.writerow(row) writer.writerow(row)
csvfile.close() csvfile.close()
@ -332,12 +399,20 @@ class RepoStats:
def export_commits(self): def export_commits(self):
print('starting export...') print("starting export...")
csvfile = open('tmp/llvm-project-export.csv', 'w') csvfile = open("tmp/llvm-project-export.csv", "w")
fieldnames = ['timestamp', 'hash', 'reviewed', 'was_reverted', 'is_revert', '# LOC changed', fieldnames = [
'modified projects', 'author domain', 'revision'] "timestamp",
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, "hash",
dialect=csv.excel) "reviewed",
"was_reverted",
"is_revert",
"# LOC changed",
"modified projects",
"author domain",
"revision",
]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, dialect=csv.excel)
writer.writeheader() writer.writeheader()
# did not work with multiprocessing.map, gave recursion error from gitpython... # did not work with multiprocessing.map, gave recursion error from gitpython...
# so using normal map function # so using normal map function
@ -349,26 +424,29 @@ class RepoStats:
def _create_row(mycommit: MyCommit) -> Dict: def _create_row(mycommit: MyCommit) -> Dict:
try: try:
return { return {
'timestamp': mycommit.date.isoformat(), "timestamp": mycommit.date.isoformat(),
'hash': mycommit.chash, "hash": mycommit.chash,
'reviewed': mycommit.was_reviewed, "reviewed": mycommit.was_reviewed,
'was_reverted': mycommit.was_reverted, "was_reverted": mycommit.was_reverted,
'is_revert': mycommit.is_revert, "is_revert": mycommit.is_revert,
'# LOC changed': mycommit.num_loc, "# LOC changed": mycommit.num_loc,
'modified projects': (';'.join(mycommit.modified_projects)), "modified projects": (";".join(mycommit.modified_projects)),
'author domain': mycommit.author_domain, "author domain": mycommit.author_domain,
'revision': mycommit.phab_revision if mycommit.phab_revision is not None else "", "revision": mycommit.phab_revision
if mycommit.phab_revision is not None
else "",
} }
except Exception as e: except Exception as e:
print(e) print(e)
return {} return {}
if __name__ == '__main__': if __name__ == "__main__":
max_age = datetime.datetime(year=2019, month=10, day=1, max_age = datetime.datetime(
tzinfo=datetime.timezone.utc) year=2019, month=10, day=1, tzinfo=datetime.timezone.utc
)
now = datetime.datetime.now(tz=datetime.timezone.utc) now = datetime.datetime.now(tz=datetime.timezone.utc)
rs = RepoStats(os.path.expanduser('~/git/llvm-project')) rs = RepoStats(os.path.expanduser("~/git/llvm-project"))
# TODO: make the path configurable, and `git clone/pull` # TODO: make the path configurable, and `git clone/pull`
rs.parse_repo(max_age) rs.parse_repo(max_age)
rs.find_reverts() rs.find_reverts()
@ -380,4 +458,4 @@ if __name__ == '__main__':
# rs.dump_unreviewed_paths(now - datetime.timedelta(days=100)) # rs.dump_unreviewed_paths(now - datetime.timedelta(days=100))
# rs.dump_loc_commits(now - datetime.timedelta(days=100)) # rs.dump_loc_commits(now - datetime.timedelta(days=100))
rs.export_commits() rs.export_commits()
print('Done.') print("Done.")

137
scripts/metrics/repo_hist_db.py Executable file
View file

@ -0,0 +1,137 @@
#!/usr/bin/env python3
import os
import sqlite3
import git
from repo_hist import MyCommit
import datetime
# Path of the SQLite file holding the extracted commit history.
DB_PATH = "tmp/git_hist.sqlite"
# Directory of the local clone that gets scanned.
REPO_DIR = "tmp/llvm-project"
# Remote that is cloned when REPO_DIR does not exist yet.
GIT_URL = "https://github.com/llvm/llvm-project.git"
# Branch whose history is walked.
GIT_BRANCH = "main"
# Commits older than this are ignored;
# this was the start of using git as primary repo.
MAX_AGE = datetime.datetime(2019, 10, 1, tzinfo=datetime.timezone.utc)
# An existing database older than this is deleted and rebuilt from scratch.
DB_UPDATE_INTERVAL = datetime.timedelta(hours=24)
def popolate_db(
    db_path: str, repo_dir: str, max_age: datetime.datetime
) -> sqlite3.Connection:
    """Return a connection to the commit database, rebuilding it when stale.

    If ``db_path`` exists and was modified less than ``DB_UPDATE_INTERVAL``
    ago, the existing file is opened as-is. Otherwise any old file is deleted
    and a new database is created and filled by scanning the repository.

    Args:
        db_path: Location of the SQLite database file.
        repo_dir: Directory of the git repository, passed to parse_commits().
        max_age: Commits committed before this time are not imported.

    Returns:
        An open connection to the (possibly freshly populated) database.

    Raises:
        Whatever the table creation or repository scan raises. In that case
        the partially written database file is removed first, so a later run
        does not mistake it for a complete, recent database.
    """
    # TODO: full scan of the git history is quite slow. Maybe enable incremental
    # updates. Only insert commits that are not yet in the database.
    if os.path.exists(db_path):
        age = datetime.datetime.now() - datetime.datetime.fromtimestamp(
            os.path.getmtime(db_path)
        )
        if age < DB_UPDATE_INTERVAL:
            print("Database is recent enough, using existing one.")
            return sqlite3.connect(db_path)
        os.remove(db_path)
    print("Database is stale, needs updating...")

    # sqlite3.connect() creates the file itself but not missing parent folders.
    db_dir = os.path.dirname(db_path)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)

    conn = sqlite3.connect(db_path)
    try:
        print("Creating tables...")
        create_tables(conn)
        print("Creating indexes...")
        create_indexes(conn)
        print("Scanning repository...")
        parse_commits(conn, repo_dir, max_age)
    except BaseException:
        # The freshness check above only looks at the file's mtime, so an
        # aborted scan must not leave a half-filled file behind.
        conn.close()
        if os.path.exists(db_path):
            os.remove(db_path)
        raise
    print("Done populating database.")
    return conn
def create_tables(conn: sqlite3.Connection):
    """Create the ``commits`` and ``commit_project`` tables if they are missing.

    Text columns are declared as TEXT. A declared type of ``string`` would get
    NUMERIC affinity in SQLite, so a value consisting only of digits (possible
    for a hexadecimal hash) would silently be stored as a number.

    Args:
        conn: Open database connection; the schema changes are committed on it.
    """
    # TODO: add more attributes as needed
    conn.execute(
        """ CREATE TABLE IF NOT EXISTS commits (
                hash TEXT PRIMARY KEY,
                commit_time INTEGER,
                phab_id TEXT,
                reverts_hash TEXT
            ); """
    )
    # Normalized representation of modified projects per commit.
    conn.execute(
        """ CREATE TABLE IF NOT EXISTS commit_project (
                project TEXT,
                hash TEXT,
                FOREIGN KEY (hash) REFERENCES commits(hash)
            );"""
    )
    conn.commit()
def create_indexes(conn: sqlite3.Connection):
    """Indexes to speed up searches and joins.

    Creates one index on ``commit_project(hash)`` and one on
    ``commit_project(project)``. Like the table creation, this uses
    ``IF NOT EXISTS`` so calling it on an already prepared database is a no-op
    instead of an error.

    Args:
        conn: Open database connection; the ``commit_project`` table must
            already exist. The changes are committed on it.
    """
    conn.execute(
        """ CREATE INDEX IF NOT EXISTS commit_project_hash
            ON commit_project(hash);"""
    )
    conn.execute(
        """ CREATE INDEX IF NOT EXISTS commit_project_project
            ON commit_project(project);"""
    )
    conn.commit()
def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.datetime):
    """Import the history of ``GIT_BRANCH`` into the database.

    Fetches the repository at ``repo_dir`` (or clones ``GIT_URL`` there as a
    bare repository when the directory does not exist), then walks the commits
    of ``GIT_BRANCH`` and stops at the first one committed before ``max_age``.
    Each commit yields one row in ``commits`` and one row in ``commit_project``
    per modified project.

    Args:
        conn: Open database connection with the tables already created.
        repo_dir: Directory of the local git repository.
        max_age: Timezone-aware cut-off; older commits are not imported.
    """
    if os.path.isdir(repo_dir):
        print("Fetching git repo...")
        repo = git.Repo(repo_dir)
        # NOTE(review): for the bare clone created in the else-branch, fetching
        # a plain branch name presumably only updates FETCH_HEAD and not the
        # local branch read by iter_commits() below -- confirm, and consider an
        # explicit refspec if the scan turns out to miss new commits.
        repo.remotes.origin.fetch(GIT_BRANCH)
    else:
        print("Cloning git repo...")
        git.Repo.clone_from(GIT_URL, repo_dir, bare=True)
        repo = git.Repo(repo_dir)
    print("repo update done.")
    sql_insert_commit = """ INSERT INTO
            commits (hash, commit_time, phab_id, reverts_hash)
            values (?,?,?,?);
        """
    sql_insert_commit_project = """ INSERT INTO
            commit_project (hash, project)
            values (?,?);
        """
    day = None
    for commit in repo.iter_commits(GIT_BRANCH):
        # TODO: This takes a couple of minutes, maybe try using multithreading
        if commit.committed_datetime < max_age:
            break
        mycommit = MyCommit(commit)
        if mycommit.date.day != day:
            day = mycommit.date.day
            print(mycommit.date)
            # take a snapshot commit, nice to see progress while updating the
            # database
            conn.commit()
        conn.execute(
            sql_insert_commit,
            (
                mycommit.chash,
                # commit_time is an integer column: store the epoch timestamp
                # rather than binding a datetime object, which would depend on
                # sqlite3's deprecated implicit adapter and end up as text.
                int(commit.committed_date),
                mycommit.phab_revision,
                mycommit.reverts_commit_hash,
            ),
        )
        # Note: parsing the patches is quite slow
        for project in mycommit.modified_projects:
            conn.execute(sql_insert_commit_project, (mycommit.chash, project))
    conn.commit()
def run_queries(conn: sqlite3.Connection, project: str = "libcxx"):
    """Print hash, Phabricator id and commit time of every commit in a project.

    Joins ``commits`` with ``commit_project`` and prints one tuple per match.
    The project name is bound as a query parameter; a double-quoted
    ``"libcxx"`` inside the SQL text is an identifier in standard SQL and only
    works as a string through a SQLite compatibility fallback.

    Args:
        conn: Open database connection with populated tables.
        project: Name to match against ``commit_project.project``.
    """
    query = """SELECT commits.hash, commits.phab_id, commits.commit_time
        FROM commits
        INNER JOIN commit_project ON commits.hash = commit_project.hash
        WHERE commit_project.project = ?;"""
    cursor = conn.cursor()
    for row in cursor.execute(query, (project,)):
        print(row)
if __name__ == "__main__":
    # Build (or reuse) the database, run the example query, and always release
    # the database handle afterwards, even when the query fails.
    conn = popolate_db(DB_PATH, REPO_DIR, MAX_AGE)
    try:
        run_queries(conn)
    finally:
        conn.close()