Instead of a local database, this script now imports the data into the shared Postgres database. This way the data can be used for other queries as well. Feel free to extend the data model if you need additional columns.
174 lines
6.3 KiB
Executable file
174 lines
6.3 KiB
Executable file
#!/usr/bin/env python3
import os
import psycopg2
import git
from repo_hist import MyCommit
import datetime
import csv
from typing import Set
# TODO: make this path configurable for use on the server
# Local directory that holds the clone of the repository.
REPO_DIR = "tmp/llvm-project"
# Remote used for the initial clone. An empty URL makes the clone fail, so
# the upstream LLVM monorepo is spelled out here.
GIT_URL = "https://github.com/llvm/llvm-project.git"
# Branch whose history is imported.
GIT_BRANCH = "main"
# this was the start of using git as primary repo
MAX_AGE = datetime.datetime(year=2019, month=10, day=1, tzinfo=datetime.timezone.utc)
def connect_to_db() -> psycopg2.extensions.connection:
    """Connect to the database, return connection object.

    The user name and password are read from the ``PGUSER`` and
    ``PGPASSWORD`` environment variables. No host is passed explicitly, so
    libpq falls back to ``PGHOST`` or its built-in default.

    Raises:
        KeyError: if ``PGUSER`` or ``PGPASSWORD`` is not set.
        psycopg2.OperationalError: if the connection cannot be established.
    """
    # Keyword arguments instead of a hand-formatted DSN: psycopg2 quotes the
    # values itself, so credentials containing spaces or quotes stay intact.
    # libpq allows blanks around "=", so a bare "host=" inside a DSN string
    # would take the following keyword as its value; the host is therefore
    # left out rather than passed empty.
    return psycopg2.connect(
        dbname="stats",
        user=os.environ["PGUSER"],
        password=os.environ["PGPASSWORD"],
        sslmode="disable",
    )
def create_tables(conn: psycopg2.extensions.connection):
    """Create database tables if needed.

    Args:
        conn: open database connection; the statement is executed and
            committed on it.
    """
    # TODO: add more attributes as needed
    # TODO: add all projects as columns
    print("Creating tables as needed...")
    # mod_<project> column: files in the subfolder (=project) <project> were
    # modified by this commit.
    # git hashes are 40 characters long, so using char(40) data type here
    sql_create_table = """ CREATE TABLE IF NOT EXISTS git_commits (
                                      hash char(40) PRIMARY KEY,
                                      commit_time timestamp,
                                      phab_id text,
                                      reverts_hash char(40),
                                      mod_llvm boolean,
                                      mod_clang boolean,
                                      mod_libcxx boolean,
                                      mod_mlir boolean
                                      ); """
    # The statement has to be sent to the server and committed; a bare string
    # literal on its own does nothing.
    with conn.cursor() as cur:
        cur.execute(sql_create_table)
    conn.commit()
def get_existing_hashes(conn: psycopg2.extensions.connection) -> Set[str]:
    """Return the hashes of all commits already stored in git_commits."""
    print("Fetching known git hashes from the database...")
    cursor = conn.cursor()
    cursor.execute("SELECT hash from git_commits;")
    records = cursor.fetchall()
    # every record is a one-element row holding the hash column
    return {record[0] for record in records}
def update_repo(repo_dir: str) -> git.Repo:
    """Clone or fetch local copy of the git repository.

    Args:
        repo_dir: directory of the local clone. If it exists it is updated
            from its ``origin`` remote, otherwise a bare clone of GIT_URL is
            created there.

    Returns:
        The repository object for ``repo_dir``.
    """
    if os.path.isdir(repo_dir):
        print("Fetching git repo...")
        repo = git.Repo(repo_dir)
        # A bare clone is created without a default fetch refspec, so the
        # branch is named on both sides: this moves the local branch forward
        # instead of only updating FETCH_HEAD.
        repo.remotes.origin.fetch("{0}:{0}".format(GIT_BRANCH))
    else:
        print("Cloning git repo...")
        git.Repo.clone_from(GIT_URL, repo_dir, bare=True)
        repo = git.Repo(repo_dir)
    print("repo update done.")
    return repo
def parse_commits(
    conn: psycopg2.extensions.connection, repo: git.Repo, max_age: datetime.datetime
):
    """Parse the git repo history and upload it to the database.

    Iterates over the commits of GIT_BRANCH, inserts every commit whose hash
    is not stored yet and stops at the first commit older than ``max_age``.

    Args:
        conn: open database connection used for all reads and writes.
        repo: local clone to read the history from.
        max_age: cut-off that is compared against each commit's
            ``committed_datetime``; commits older than this are not imported.
    """
    sql_insert_commit = """ INSERT INTO
          git_commits (hash, commit_time, phab_id, reverts_hash)
          values (%s,%s,%s,%s);
          """
    sql_update_commit_project = (
        """ UPDATE git_commits SET mod_{} = %s where hash = %s;"""
    )
    # Only these projects have a mod_<project> column. The name is formatted
    # into the SQL text, so it must never come from outside this allowlist.
    tracked_projects = ("llvm", "libcxx", "mlir", "clang")
    known_hashes = get_existing_hashes(conn)
    day = None
    cur = conn.cursor()
    for commit in repo.iter_commits(GIT_BRANCH):
        # TODO: This takes a couple of minutes, maybe try using multithreading
        # Only store new/unknown hashes
        if commit.hexsha in known_hashes:
            continue
        if commit.committed_datetime < max_age:
            break
        mycommit = MyCommit(commit)
        # The column is a plain timestamp, so store naive UTC to keep the
        # values independent of the database session's time zone.
        commit_time = commit.committed_datetime.astimezone(
            datetime.timezone.utc
        ).replace(tzinfo=None)
        if commit_time.date() != day:
            # take a snapshot commit, nice to see progress while updating the
            # database
            day = commit_time.date()
            print(day)
            conn.commit()
        # NOTE(review): phab_revision and reverts_commit_hash are assumed to
        # be the MyCommit attributes feeding phab_id / reverts_hash -- confirm
        # against repo_hist.MyCommit.
        cur.execute(
            sql_insert_commit,
            (
                mycommit.chash,
                commit_time,
                mycommit.phab_revision,
                mycommit.reverts_commit_hash,
            ),
        )
        # Note: parsing the patches is quite slow
        for project in mycommit.modified_projects:
            # TODO find a way to make this generic for all projects, maybe use
            # "ALTER TABLE" to add columns as they appear
            # TODO: modifying the committed row is expensive, maybe find something faster
            if project in tracked_projects:
                cur.execute(
                    sql_update_commit_project.format(project), (True, mycommit.chash)
                )
    # persist whatever was inserted since the last snapshot commit
    conn.commit()
def create_csv_report(title: str, query: str, output_path: str, conn=None):
    """Run ``query`` and write its result to ``<output_path>/<title>.csv``.

    The first CSV row holds the column names of the result set.

    Args:
        title: file name of the report, without the ``.csv`` suffix.
        query: SQL statement to execute.
        output_path: directory the report file is written to.
        conn: open database connection. When omitted, a new connection is
            opened with connect_to_db() and closed again afterwards.
    """
    owns_connection = conn is None
    if owns_connection:
        conn = connect_to_db()
    try:
        with conn.cursor() as cursor:
            cursor.execute(query)
            report_file = os.path.join(output_path, title + ".csv")
            # newline="" lets the csv module control the line endings itself
            with open(report_file, "w", newline="") as csv_file:
                writer = csv.writer(csv_file)
                # write column headers
                writer.writerow([description[0] for description in cursor.description])
                # psycopg2's execute() returns None; the rows are read by
                # iterating over the cursor itself.
                writer.writerows(cursor)
    finally:
        if owns_connection:
            conn.close()


def run_queries(conn: psycopg2.extensions.connection, output_path: str):
    """Write the CSV reports about the imported commits.

    Args:
        conn: open database connection the queries are run on.
        output_path: directory the report files are written to.
    """
    print("running queries...")
    create_csv_report("full_db_dump", "select * from git_commits;", output_path, conn)
    # The placeholder takes an optional WHERE clause on one mod_<project>
    # column. to_char() is the PostgreSQL way of formatting a timestamp;
    # strftime() only exists in SQLite.
    query = """SELECT to_char(commit_time, 'YYYY-MM') as month, count(hash) as num_commits, count(phab_id) as num_reviewed,
          (100.0*count(phab_id)/count(hash)) as percent_reviewed, count(reverts_hash) as num_reverted,
          (100.0*count(reverts_hash)/count(hash)) as percent_reverted
          FROM git_commits
          {}
          GROUP BY month
          ORDER BY month;
          """
    for project in ("libcxx", "mlir"):
        create_csv_report(
            "{}_stats".format(project),
            query.format("WHERE mod_{}".format(project)),
            output_path,
            conn,
        )
    create_csv_report("all_projects_stats", query.format(""), output_path, conn)
def update_comits():
    """Update the git commits in the database from the git repository.

    Refreshes the local clone in REPO_DIR, then imports all commits down to
    MAX_AGE that are not stored yet. The database connection is closed again
    when the import finishes or fails.
    """
    repo = update_repo(REPO_DIR)
    conn = connect_to_db()
    try:
        # make sure the git_commits table exists before it is read and filled
        create_tables(conn)
        parse_commits(conn, repo, MAX_AGE)
    finally:
        conn.close()
if __name__ == "__main__":
    # Running the script imports new commits into the database.
    update_comits()
    # TODO: add argparse to switch between import and query mode or
    # move queries to another file
    # run_queries(conn)