2021-04-27 09:56:24 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
import os
|
2021-05-10 16:50:51 +02:00
|
|
|
import psycopg2
|
2021-04-27 09:56:24 +02:00
|
|
|
import git
|
|
|
|
from repo_hist import MyCommit
|
|
|
|
import datetime
|
2021-04-27 16:42:38 +02:00
|
|
|
import csv
|
2021-05-10 16:50:51 +02:00
|
|
|
from typing import Set
|
2021-04-27 09:56:24 +02:00
|
|
|
|
2021-05-10 16:50:51 +02:00
|
|
|
# TODO: make this path configurable for use on the server
# Local working copy of the repository (bare clone, see update_repo()).
REPO_DIR = "tmp/llvm-project"
# Upstream repository to clone/fetch from.
GIT_URL = "https://github.com/llvm/llvm-project.git"
# Branch whose history is imported.
GIT_BRANCH = "main"
# Directory the CSV reports are written to.
OUTPUT_PATH = "tmp"

# this was the start of using git as primary repo
# Commits older than this timestamp are ignored during import.
MAX_AGE = datetime.datetime(year=2019, month=10, day=1, tzinfo=datetime.timezone.utc)
|
2021-05-10 16:50:51 +02:00
|
|
|
|
|
|
|
|
|
|
|
def connect_to_db() -> psycopg2.extensions.connection:
    """Connect to the database, return connection object.

    Credentials are taken from the PGUSER / PGPASSWORD environment
    variables; a KeyError is raised if either is unset.
    """
    dsn = "host=127.0.0.1 sslmode=disable dbname=stats user={} password={}".format(
        os.environ["PGUSER"], os.environ["PGPASSWORD"]
    )
    return psycopg2.connect(dsn)
|
|
|
|
|
|
|
|
|
2021-05-10 16:50:51 +02:00
|
|
|
def create_tables(conn: psycopg2.extensions.connection):
    """Create database tables if needed.

    Idempotent: uses CREATE TABLE IF NOT EXISTS and commits the DDL.
    """
    # TODO: add more attributes as needed
    # TODO: add all projects as columns
    print("Creating tables as needed...")
    cur = conn.cursor()
    # mod_<project> column: files in the subfolder (=project) <project> were
    # modified by this commit.
    # git hashes are 40 characters long, so using char(40) data type here
    cur.execute(
        """ CREATE TABLE IF NOT EXISTS git_commits (
                  hash char(40) PRIMARY KEY,
                  commit_time timestamp,
                  phab_id text,
                  reverts_hash char(40),
                  mod_llvm boolean,
                  mod_clang boolean,
                  mod_libcxx boolean,
                  mod_mlir boolean
                  ); """
    )
    conn.commit()
|
|
|
|
|
|
|
|
|
2021-05-10 16:50:51 +02:00
|
|
|
def get_existing_hashes(conn: psycopg2.extensions.connection) -> Set[str]:
    """Fetch all stored git hashes from the database.

    Used to skip commits that were already imported in a previous run.
    """
    print("Fetching known git hashes from the database...")
    cursor = conn.cursor()
    cursor.execute("SELECT hash from git_commits;")
    return {row[0] for row in cursor.fetchall()}
|
|
|
|
|
|
|
|
|
|
|
|
def update_repo(repo_dir: str) -> git.Repo:
    """Clone or fetch local copy of the git repository."""
    if not os.path.isdir(repo_dir):
        # No local copy yet: create a bare clone, then open it.
        print("Cloning git repo...")
        git.Repo.clone_from(GIT_URL, repo_dir, bare=True)
        repo = git.Repo(repo_dir)
    else:
        # Local copy exists: just fetch the branch we import from.
        print("Fetching git repo...")
        repo = git.Repo(repo_dir)
        repo.remotes.origin.fetch(GIT_BRANCH)
    print("repo update done.")
    return repo
|
|
|
|
|
|
|
|
|
|
|
|
def parse_commits(
    conn: psycopg2.extensions.connection, repo: git.Repo, max_age: datetime.datetime
):
    """Parse the git repo history and upload it to the database.

    Walks GIT_BRANCH from newest to oldest, skipping commits already in the
    database and stopping at the first commit older than `max_age` (relies
    on iter_commits yielding commits in reverse chronological order).
    Commits the transaction once per calendar day of history and once at
    the end.
    """
    sql_insert_commit = """ INSERT INTO
           git_commits (hash, commit_time, phab_id, reverts_hash)
           values (%s,%s,%s,%s);
        """
    # Placeholder {} is filled with a whitelisted project name below, never
    # with external input.
    sql_update_commit_project = (
        """ UPDATE git_commits SET mod_{} = %s where hash = %s;"""
    )
    known_hashes = get_existing_hashes(conn)
    # Day-of-month of the last progress snapshot; None forces one on the
    # first commit.
    day = None
    cur = conn.cursor()
    for commit in repo.iter_commits(GIT_BRANCH):
        # TODO: This takes a couple of minutes, maybe try using multithreading

        # Only store new/unknown hashes
        if commit.hexsha in known_hashes:
            continue
        # History is iterated newest-first, so everything after this point
        # is too old as well.
        if commit.committed_datetime < max_age:
            break
        # MyCommit wraps the raw git commit (hash, date, Phabricator id,
        # revert info, modified projects) — see repo_hist module.
        mycommit = MyCommit(commit)
        if mycommit.date.day != day:
            # take a snapshot commit, nice to see progress while updating the
            # database
            day = mycommit.date.day
            print(mycommit.date)
            conn.commit()
        cur.execute(
            sql_insert_commit,
            (
                mycommit.chash,
                mycommit.date,
                mycommit.phab_revision,
                mycommit.reverts_commit_hash,
            ),
        )
        # Note: parsing the patches is quite slow
        for project in mycommit.modified_projects:
            # TODO find a way to make this generic for all projects, maybe use
            # "ALTER TABLE" to add columns as they appear
            # TODO: modifying the committed row is expensive, maybe find something faster
            # Only projects with a matching mod_<project> column are recorded.
            if project in ["llvm", "libcxx", "mlir", "clang"]:
                cur.execute(
                    sql_update_commit_project.format(project), (True, mycommit.chash)
                )
    conn.commit()
|
|
|
|
|
|
|
|
|
2021-04-27 16:42:38 +02:00
|
|
|
def create_csv_report(title: str, query: str, output_path: str, connection=None):
    """Run `query` and write its result to <output_path>/<title>.csv.

    Args:
        title: base name of the CSV file (".csv" is appended).
        query: SQL statement to execute.
        output_path: directory the CSV file is written to.
        connection: database connection to use. Defaults to a module-level
            `conn` for backward compatibility with existing 3-argument calls.

    The first CSV row contains the column names from the cursor description.
    """
    if connection is None:
        # NOTE(review): the original code read an undefined global `conn`
        # here; callers should pass the connection explicitly.
        connection = conn
    cursor = connection.cursor()
    # psycopg2's cursor.execute() returns None (the original bound it to
    # `data` and iterated it); iterate the cursor itself to get the rows.
    cursor.execute(query)
    # newline="" is required by the csv module to avoid extra blank lines.
    with open(os.path.join(output_path, title + ".csv"), "w", newline="") as csv_file:
        writer = csv.writer(csv_file)
        # write column headers
        writer.writerow([description[0] for description in cursor.description])
        for row in cursor:
            writer.writerow(row)
|
|
|
|
|
|
|
|
|
2021-05-10 16:50:51 +02:00
|
|
|
def run_queries(conn: psycopg2.extensions.connection, output_path: str):
    """Run the statistics queries and write the results as CSV reports.

    Args:
        conn: open database connection (reports are produced via
            create_csv_report).
        output_path: directory the CSV files are written to.
    """
    print("running queries...")
    # The table created by create_tables() is named git_commits; the
    # original queries referenced a non-existent "commits" table.
    create_csv_report("full_db_dump", "select * from git_commits;", output_path)

    # to_char() is the PostgreSQL equivalent of SQLite's
    # strftime('%Y-%m', ...), which does not exist in PostgreSQL.
    query = """SELECT to_char(commit_time, 'YYYY-MM') as month, count(hash) as num_commits, count(phab_id) as num_reviewed,
          (100.0*count(phab_id)/count(hash)) as percent_reviewed, count(reverts_hash) as num_reverted,
          (100.0*count(reverts_hash)/count(hash)) as percent_reverted
        FROM git_commits
        WHERE mod_{}
        GROUP BY month;
        """
    create_csv_report("libcxx_stats", query.format("libcxx"), output_path)
    create_csv_report("mlir_stats", query.format("mlir"), output_path)

    query = """SELECT to_char(commit_time, 'YYYY-MM') as month, count(hash) as num_commits, count(phab_id) as num_reviewed,
          (100.0*count(phab_id)/count(hash)) as percent_reviewed, count(reverts_hash) as num_reverted,
          (100.0*count(reverts_hash)/count(hash)) as percent_reverted
        FROM git_commits
        GROUP BY month;
        """
    create_csv_report("all_projects_stats", query, output_path)
|
2021-04-27 09:56:24 +02:00
|
|
|
|
|
|
|
|
2021-05-10 16:50:51 +02:00
|
|
|
def update_comits():
    """Update the git commits in the database from the git repository.

    Clones/fetches the repo, ensures the tables exist, then imports the
    commit history (see parse_commits).
    """
    repo = update_repo(REPO_DIR)
    conn = connect_to_db()
    try:
        create_tables(conn)
        parse_commits(conn, repo, MAX_AGE)
    finally:
        # The original leaked the connection; always release it, even if
        # parsing fails part-way through.
        conn.close()
|
|
|
|
|
|
|
|
|
2021-04-27 09:56:24 +02:00
|
|
|
if __name__ == "__main__":
    # Import mode only for now; query/report mode is still manual.
    update_comits()
    # TODO: add argparse to switch between import and query mode or
    # move queries to another file
    # run_queries(conn)
|