From 134ca4b80139590db9348b63de0ea4653e61cd0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=BChnel?=
Date: Mon, 10 May 2021 16:50:51 +0200
Subject: [PATCH] repo_hist_db now using postgres DB

Instead of a local database, this script now imports the data into the
shared Postgres database. This way the data can be used for other
queries as well. Feel free to extend the data model if you need
additional columns.
---
 scripts/metrics/repo_hist_db.py | 112 +++++++++++++++++++-------
 1 file changed, 68 insertions(+), 44 deletions(-)

diff --git a/scripts/metrics/repo_hist_db.py b/scripts/metrics/repo_hist_db.py
index ecb1bdb..3b3b2b9 100755
--- a/scripts/metrics/repo_hist_db.py
+++ b/scripts/metrics/repo_hist_db.py
@@ -1,12 +1,13 @@
 #!/usr/bin/env python3
 import os
-import sqlite3
+import psycopg2
 import git
 from repo_hist import MyCommit
 import datetime
 import csv
+from typing import Set
 
-DB_PATH = "tmp/git_hist.sqlite"
+# TODO: make this path configurable for use on the server
 REPO_DIR = "tmp/llvm-project"
 GIT_URL = "https://github.com/llvm/llvm-project.git"
 GIT_BRANCH = "main"
@@ -14,45 +15,33 @@ OUTPUT_PATH = "tmp"
 
 # this was the start of using git as primary repo
 MAX_AGE = datetime.datetime(year=2019, month=10, day=1, tzinfo=datetime.timezone.utc)
-# Maximum age of the database before we re-create it
-DB_UPDATE_INTERVAL = datetime.timedelta(days=1)
 
 
-def popolate_db(
-    db_path: str, repo_dir: str, max_age: datetime.datetime
-) -> sqlite3.Connection:
-    # TODO: full scan of the git history is quite slow. Maybe enable incremental
-    #  updates. Only insert commits that are not yet in the database.
-    if os.path.exists(db_path):
-        age = datetime.datetime.now() - datetime.datetime.fromtimestamp(
-            os.path.getmtime(db_path)
+def connect_to_db() -> psycopg2.extensions.connection:
+    """Connect to the database, return connection object."""
+    conn = psycopg2.connect(
+        "host=127.0.0.1 sslmode=disable dbname=stats user={} password={}".format(
+            os.environ["PGUSER"], os.environ["PGPASSWORD"]
         )
-        if age < DB_UPDATE_INTERVAL:
-            print("Database is recent enough, using existing one.")
-            return sqlite3.connect(db_path)
-        os.remove(db_path)
-
-    print("Database is stale, needs updating...")
-    conn = sqlite3.connect(db_path)
-    print("Creating tables...")
-    create_tables(conn)
-    print("Scanning repository...")
-    parse_commits(conn, repo_dir, max_age)
-    print("Done populating database.")
+    )
     return conn
 
 
-def create_tables(conn: sqlite3.Connection):
+def create_tables(conn: psycopg2.extensions.connection):
+    """Create database tables if needed."""
     # TODO: add more attributes as needed
     # TODO: add all projects as columns
+    print("Creating tables as needed...")
+    cur = conn.cursor()
     # mod_<project> column: files in the subfolder (=project) were
     # modified by this commit.
-    conn.execute(
-        """ CREATE TABLE IF NOT EXISTS commits (
-                hash string PRIMARY KEY,
-                commit_time integer,
-                phab_id string,
-                reverts_hash string,
+    # git hashes are 40 characters long, so using char(40) data type here
+    cur.execute(
+        """ CREATE TABLE IF NOT EXISTS git_commits (
+                hash char(40) PRIMARY KEY,
+                commit_time timestamp,
+                phab_id text,
+                reverts_hash char(40),
                 mod_llvm boolean,
                 mod_clang boolean,
                 mod_libcxx boolean,
@@ -63,7 +52,16 @@ def create_tables(conn: sqlite3.Connection):
     conn.commit()
 
 
-def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.datetime):
+def get_existing_hashes(conn: psycopg2.extensions.connection) -> Set[str]:
+    """Fetch all stored git hashes from the database."""
+    print("Fetching known git hashes from the database...")
+    cur = conn.cursor()
+    cur.execute("SELECT hash FROM git_commits;")
+    return set(row[0] for row in cur.fetchall())
+
+
+def update_repo(repo_dir: str) -> git.Repo:
+    """Clone or fetch local copy of the git repository."""
     if os.path.isdir(repo_dir):
         print("Fetching git repo...")
         repo = git.Repo(repo_dir)
@@ -73,25 +71,40 @@ def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.dat
         git.Repo.clone_from(GIT_URL, repo_dir, bare=True)
         repo = git.Repo(repo_dir)
     print("repo update done.")
-    sql_insert_commit = """ INSERT INTO
-          commits (hash, commit_time, phab_id, reverts_hash)
-          values (?,?,?,?);
-        """
-    sql_update_commit_project = """ UPDATE commits SET mod_{} = ? where hash = ?;"""
+    return repo
+
+
+def parse_commits(
+    conn: psycopg2.extensions.connection, repo: git.Repo, max_age: datetime.datetime
+):
+    """Parse the git repo history and upload it to the database."""
+
+    sql_insert_commit = """ INSERT INTO
+          git_commits (hash, commit_time, phab_id, reverts_hash)
+          VALUES (%s,%s,%s,%s);
+        """
+    sql_update_commit_project = (
+        """ UPDATE git_commits SET mod_{} = %s WHERE hash = %s;"""
+    )
+    known_hashes = get_existing_hashes(conn)
     day = None
+    cur = conn.cursor()
     for commit in repo.iter_commits(GIT_BRANCH):
         # TODO: This takes a couple of minutes, maybe try using multithreading
+
+        # Only store new/unknown hashes
+        if commit.hexsha in known_hashes:
+            continue
         if commit.committed_datetime < max_age:
             break
         mycommit = MyCommit(commit)
         if mycommit.date.day != day:
-            day = mycommit.date.day
-            print(mycommit.date)
             # take a snapshot commit, nice to see progress while updating the
             # database
+            day = mycommit.date.day
+            print(mycommit.date)
             conn.commit()
-        conn.execute(
+        cur.execute(
             sql_insert_commit,
             (
                 mycommit.chash,
@@ -104,8 +117,9 @@ def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.dat
         for project in mycommit.modified_projects:
             # TODO find a way to make this generic for all projects, maybe user
             # "ALTER TABLE" to add columns as they appear
+            # TODO: modifying the committed row is expensive, maybe find something faster
             if project in ["llvm", "libcxx", "mlir", "clang"]:
-                conn.execute(
+                cur.execute(
                     sql_update_commit_project.format(project), (True, mycommit.chash)
                 )
     conn.commit()
@@ -122,7 +136,7 @@ def create_csv_report(title: str, query: str, output_path: str):
             writer.writerow(row)
 
 
-def run_queries(conn: sqlite3.Connection, output_path: str):
+def run_queries(conn: psycopg2.extensions.connection, output_path: str):
     print("running queries...")
     create_csv_report("full_db_dump", "select * from commits;", output_path)
 
@@ -145,6 +159,16 @@ def run_queries(conn: sqlite3.Connection, output_path: str):
     create_csv_report("all_projects_stats", query, output_path)
 
 
+def update_commits():
+    """Update the git commits in the database from the git repository."""
+    repo = update_repo(REPO_DIR)
+    conn = connect_to_db()
+    create_tables(conn)
+    parse_commits(conn, repo, MAX_AGE)
+
+
 if __name__ == "__main__":
-    conn = popolate_db(DB_PATH, REPO_DIR, MAX_AGE)
-    run_queries(conn, OUTPUT_PATH)
+    update_commits()
+    # TODO: add argparse to switch between import and query mode or
+    # move queries to another file
+    # run_queries(conn)
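
A note on the sqlite3-to-psycopg2 switch above: psycopg2 uses %s placeholders where sqlite3 used ?, and the driver escapes the bound values itself. Below is a minimal, self-contained sketch of the pattern parse_commits relies on; the connection settings mirror connect_to_db, and the inserted row is dummy data:

    import datetime
    import os

    import psycopg2

    conn = psycopg2.connect(
        host="127.0.0.1",
        dbname="stats",
        user=os.environ["PGUSER"],
        password=os.environ["PGPASSWORD"],
    )
    cur = conn.cursor()
    # The driver substitutes %s with escaped values; placeholders are the
    # safe channel for data. The mod_{}.format(project) above is only safe
    # because project is checked against a fixed allowlist first.
    cur.execute(
        "INSERT INTO git_commits (hash, commit_time, phab_id, reverts_hash)"
        " VALUES (%s, %s, %s, %s);",
        ("0" * 40, datetime.datetime.now(), None, None),  # dummy row
    )
    conn.commit()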
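
get_existing_hashes pulls every stored hash into client memory so the loop can skip known commits. An alternative worth weighing (a suggestion, not part of this patch) is to let Postgres discard duplicates itself with ON CONFLICT, which applies here because hash is the primary key. The insert statement in parse_commits would become:

    sql_insert_commit = """ INSERT INTO
          git_commits (hash, commit_time, phab_id, reverts_hash)
          VALUES (%s,%s,%s,%s)
          ON CONFLICT (hash) DO NOTHING;
        """

That would make the known_hashes pre-fetch and the in-loop membership test unnecessary, though the script would still iterate the full git history up to MAX_AGE.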
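
For the final TODO, here is a sketch of the argparse switch between import and query mode; the mode names are hypothetical, while update_commits, connect_to_db, run_queries, and OUTPUT_PATH are the definitions from this patch:

    import argparse

    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description="LLVM git history statistics")
        parser.add_argument(
            "mode",
            choices=["import", "query"],
            help="import new commits or run the CSV report queries",
        )
        args = parser.parse_args()
        if args.mode == "import":
            update_commits()
        else:
            run_queries(connect_to_db(), OUTPUT_PATH)

Note that run_queries still reads from the old commits table ("select * from commits;"), so its queries need updating to git_commits before the query mode is useful.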