improved git metrics script (#295)
* simplified database schema
* added generic CSV export
* scripts are generating first charts
* dumping entire DB into CSV file
* added more stats
parent ac4f2bcb82
commit 31cbc77e38
2 changed files with 55 additions and 40 deletions
@@ -117,11 +117,13 @@ class MyCommit:
         return set(p.split("/")[0] for p in self.modified_paths)

     @property
-    def reverts_commit_hash(self):
+    def reverts_commit_hash(self) -> Optional[str]:
         m = REVERT_HASH_REGEX.search(self.commit.message)
         if m is None:
-            # TODO: double check for "Reverts" in summary line for consistency
-            return None
+            if self.reverts_summary() is None:
+                return None
+            # there was a revert, but we do not know the commit hash
+            return "unknown"
         return m.group(1)

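For reference, a minimal standalone sketch of the revert-detection logic changed above. REVERT_HASH_REGEX and the reverts_summary() helper are defined elsewhere in repo_hist.py and do not appear in this diff, so the patterns below are assumptions for illustration only:

    # Self-contained sketch; both regex patterns are assumed, not the script's real definitions.
    import re
    from typing import Optional

    REVERT_HASH_REGEX = re.compile(r"This reverts commit ([0-9a-f]+)")  # assumption
    REVERT_SUMMARY_REGEX = re.compile(r'^Revert "')                     # assumption

    def reverts_commit_hash(message: str) -> Optional[str]:
        m = REVERT_HASH_REGEX.search(message)
        if m is None:
            if REVERT_SUMMARY_REGEX.search(message) is None:
                return None
            # there was a revert, but we do not know the commit hash
            return "unknown"
        return m.group(1)

    print(reverts_commit_hash('Revert "foo"\n\nThis reverts commit abc123.'))  # abc123
    print(reverts_commit_hash('Revert "foo"'))                                 # unknown
    print(reverts_commit_hash("regular commit"))                               # None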
@@ -4,11 +4,14 @@ import sqlite3
 import git
 from repo_hist import MyCommit
 import datetime
+import csv

 DB_PATH = "tmp/git_hist.sqlite"
 REPO_DIR = "tmp/llvm-project"
 GIT_URL = "https://github.com/llvm/llvm-project.git"
 GIT_BRANCH = "main"
+OUTPUT_PATH = "tmp"

 # this was the start of using git as primary repo
 MAX_AGE = datetime.datetime(year=2019, month=10, day=1, tzinfo=datetime.timezone.utc)
 # Maximum age of the database before we re-create it
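MAX_AGE is constructed timezone-aware (UTC) so it can be compared directly against the timezone-aware commit timestamps that GitPython exposes; comparing a naive and an aware datetime would raise a TypeError. A tiny standalone check (the actual comparison happens inside parse_commits, outside this hunk):

    import datetime

    MAX_AGE = datetime.datetime(year=2019, month=10, day=1, tzinfo=datetime.timezone.utc)
    commit_time = datetime.datetime(2020, 6, 15, tzinfo=datetime.timezone.utc)  # made-up example
    print(commit_time >= MAX_AGE)  # True -> young enough to be scanned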
@@ -33,8 +36,6 @@ def popolate_db(
     conn = sqlite3.connect(db_path)
     print("Creating tables...")
     create_tables(conn)
-    print("Creating indexes...")
-    create_indexes(conn)
     print("Scanning repository...")
     parse_commits(conn, repo_dir, max_age)
     print("Done populating database.")
@@ -43,39 +44,25 @@ def popolate_db(

 def create_tables(conn: sqlite3.Connection):
     # TODO: add more attributes as needed
+    # TODO: add all projects as columns
+    # mod_<project> column: files in the subfolder (=project) <project> were
+    # modified by this commit.
     conn.execute(
         """ CREATE TABLE IF NOT EXISTS commits (
                 hash string PRIMARY KEY,
                 commit_time integer,
                 phab_id string,
-                reverts_hash string
+                reverts_hash string,
+                mod_llvm boolean,
+                mod_clang boolean,
+                mod_libcxx boolean,
+                mod_mlir boolean
             ); """
     )
-    # Normalized representation of modified projects per commit.
-    conn.execute(
-        """ CREATE TABLE IF NOT EXISTS commit_project (
-                project string,
-                hash string,
-                FOREIGN KEY (hash) REFERENCES commits(hash)
-            );"""
-    )

     conn.commit()


-def create_indexes(conn: sqlite3.Connection):
-    """Indexes to speed up searches and joins."""
-    conn.execute(
-        """ CREATE INDEX commit_project_hash
-                ON commit_project(hash);"""
-    )
-    conn.execute(
-        """ CREATE INDEX commit_project_project
-                ON commit_project(project);"""
-    )
-    conn.commit()
-
-
 def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.datetime):
     if os.path.isdir(repo_dir):
         print("Fetching git repo...")
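With the normalized commit_project table and its indexes gone, per-project information now lives in the mod_<project> boolean columns of commits, so filtering by project is a plain WHERE on a single table. A minimal in-memory sketch of the new schema (the inserted row is made up):

    # In-memory sketch of the flattened schema; the row is invented example data.
    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute(
        """ CREATE TABLE IF NOT EXISTS commits (
                hash string PRIMARY KEY,
                commit_time integer,
                phab_id string,
                reverts_hash string,
                mod_llvm boolean,
                mod_clang boolean,
                mod_libcxx boolean,
                mod_mlir boolean
            ); """
    )
    conn.execute(
        "INSERT INTO commits (hash, commit_time, phab_id, reverts_hash) values (?,?,?,?);",
        ("abc123", "2020-10-01 12:00:00", "D12345", None),
    )
    conn.execute("UPDATE commits SET mod_libcxx = ? where hash = ?;", (True, "abc123"))
    print(conn.execute("SELECT count(hash) FROM commits WHERE mod_libcxx;").fetchone())  # (1,)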
@@ -90,10 +77,8 @@ def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.dat
                 commits (hash, commit_time, phab_id, reverts_hash)
                 values (?,?,?,?);
                 """
-    sql_insert_commit_project = """ INSERT INTO
-                commit_project (hash, project)
-                values (?,?);
-                """
+    sql_update_commit_project = """ UPDATE commits SET mod_{} = ? where hash = ?;"""
     day = None
     for commit in repo.iter_commits(GIT_BRANCH):
         # TODO: This takes a couple of minutes, maybe try using multithreading
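SQLite can bind values but not column names, so the new statement splices the project name into the SQL text with str.format and keeps only the value and hash as bound parameters. A quick standalone illustration:

    # The column name is formatted into the SQL text; the value and hash stay '?' parameters.
    sql_update_commit_project = """ UPDATE commits SET mod_{} = ? where hash = ?;"""
    print(sql_update_commit_project.format("libcxx"))
    # ->  UPDATE commits SET mod_libcxx = ? where hash = ?;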
@@ -117,21 +102,49 @@ def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.dat
         )
         # Note: prasing the patches is quite slow
         for project in mycommit.modified_projects:
-            conn.execute(sql_insert_commit_project, (mycommit.chash, project))
+            # TODO find a way to make this generic for all projects, maybe user
+            # "ALTER TABLE" to add columns as they appear
+            if project in ["llvm", "libcxx", "mlir", "clang"]:
+                conn.execute(
+                    sql_update_commit_project.format(project), (True, mycommit.chash)
+                )
     conn.commit()


-def run_queries(conn: sqlite3.Connection):
-    query = """SELECT commits.hash, commits.phab_id, commits.commit_time
-            FROM commits
-            INNER JOIN commit_project ON commits.hash = commit_project.hash
-            WHERE commit_project.project="libcxx";"""
+def create_csv_report(title: str, query: str, output_path: str):
     cursor = conn.cursor()
     data = cursor.execute(query)
-    for row in data:
-        print(row)
+    with open(os.path.join(output_path, title + ".csv"), "w") as csv_file:
+        writer = csv.writer(csv_file)
+        # write column headers
+        writer.writerow([description[0] for description in cursor.description])
+        for row in data:
+            writer.writerow(row)
+
+
+def run_queries(conn: sqlite3.Connection, output_path: str):
+    print("running queries...")
+    create_csv_report("full_db_dump", "select * from commits;", output_path)
+
+    query = """SELECT strftime('%Y-%m',commit_time) as month, count(hash) as num_commits, count(phab_id) as num_reviewed,
+          (100.0*count(phab_id)/count(hash)) as percent_reviewed, count(reverts_hash) as num_reverted,
+          (100.0*count(reverts_hash)/count(hash)) as percent_reverted
+        FROM commits
+        WHERE mod_{}
+        GROUP BY month;
+        """
+    create_csv_report("libcxx_stats", query.format("libcxx"), output_path)
+    create_csv_report("mlir_stats", query.format("mlir"), output_path)
+
+    query = """SELECT strftime('%Y-%m',commit_time) as month, count(hash) as num_commits, count(phab_id) as num_reviewed,
+          (100.0*count(phab_id)/count(hash)) as percent_reviewed, count(reverts_hash) as num_reverted,
+          (100.0*count(reverts_hash)/count(hash)) as percent_reverted
+        FROM commits
+        GROUP BY month;
+        """
+    create_csv_report("all_projects_stats", query, output_path)


 if __name__ == "__main__":
     conn = popolate_db(DB_PATH, REPO_DIR, MAX_AGE)
-    run_queries(conn)
+    run_queries(conn, OUTPUT_PATH)
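The monthly statistics lean on SQL count() skipping NULLs: count(hash) counts every commit in a month, while count(phab_id) and count(reverts_hash) only count commits that carry a Phabricator id or a revert hash, which is what makes the percent_* columns meaningful. A standalone sketch with made-up rows:

    # Standalone sketch of the monthly aggregation; the three rows are invented.
    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute(
        "CREATE TABLE commits (hash string PRIMARY KEY, commit_time integer, phab_id string, reverts_hash string);"
    )
    conn.executemany(
        "INSERT INTO commits values (?,?,?,?);",
        [
            ("a1", "2020-10-02 10:00:00", "D100", None),  # reviewed
            ("a2", "2020-10-05 11:00:00", None, None),    # not reviewed
            ("a3", "2020-10-09 12:00:00", "D101", "a2"),  # reviewed, reverts a2
        ],
    )
    query = """SELECT strftime('%Y-%m',commit_time) as month, count(hash) as num_commits,
                      count(phab_id) as num_reviewed, (100.0*count(phab_id)/count(hash)) as percent_reviewed
               FROM commits
               GROUP BY month;"""
    print(conn.execute(query).fetchall())  # [('2020-10', 3, 2, 66.666...)]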