repo_hist_db now using postgres DB
Instead of a local SQLite database, this script now imports the data into the shared Postgres database. This way the data can be used for other queries as well. Feel free to extend the data model if you need additional columns.
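For example, another consumer of the shared database could read the table back with a few lines of psycopg2. This is a minimal sketch, assuming the same "stats" database, git_commits table, and PGUSER/PGPASSWORD environment variables that this script uses; the weekly-count query is only an illustration:

import os
import psycopg2

# Same connection parameters as repo_hist_db itself (assumption).
conn = psycopg2.connect(
    "host=127.0.0.1 sslmode=disable dbname=stats user={} password={}".format(
        os.environ["PGUSER"], os.environ["PGPASSWORD"]
    )
)
cur = conn.cursor()
# Illustrative query: commits per week that modified clang.
cur.execute(
    """SELECT date_trunc('week', commit_time) AS week, count(*)
       FROM git_commits
       WHERE mod_clang
       GROUP BY week
       ORDER BY week;"""
)
for week, count in cur.fetchall():
    print(week, count)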
This commit is contained in:
parent 668e50298c
commit 134ca4b801
1 changed file with 68 additions and 44 deletions
@@ -1,12 +1,13 @@
 #!/usr/bin/env python3
 import os
-import sqlite3
+import psycopg2
 import git
 from repo_hist import MyCommit
 import datetime
 import csv
+from typing import Set

-DB_PATH = "tmp/git_hist.sqlite"
+# TODO: make this path configurable for use on the server
 REPO_DIR = "tmp/llvm-project"
 GIT_URL = "https://github.com/llvm/llvm-project.git"
 GIT_BRANCH = "main"
@@ -14,45 +15,33 @@ OUTPUT_PATH = "tmp"
 # this was the start of using git as primary repo
 MAX_AGE = datetime.datetime(year=2019, month=10, day=1, tzinfo=datetime.timezone.utc)
-# Maximum age of the database before we re-create it
-DB_UPDATE_INTERVAL = datetime.timedelta(days=1)


-def popolate_db(
-    db_path: str, repo_dir: str, max_age: datetime.datetime
-) -> sqlite3.Connection:
-    # TODO: full scan of the git history is quite slow. Maybe enable incremental
-    # updates. Only insert commits that are not yet in the database.
-    if os.path.exists(db_path):
-        age = datetime.datetime.now() - datetime.datetime.fromtimestamp(
-            os.path.getmtime(db_path)
-        )
-        if age < DB_UPDATE_INTERVAL:
-            print("Database is recent enough, using existing one.")
-            return sqlite3.connect(db_path)
-        os.remove(db_path)
-
-    print("Database is stale, needs updating...")
-    conn = sqlite3.connect(db_path)
-    print("Creating tables...")
-    create_tables(conn)
-    print("Scanning repository...")
-    parse_commits(conn, repo_dir, max_age)
-    print("Done populating database.")
+def connect_to_db() -> psycopg2.extensions.connection:
+    """Connect to the database, return connection object."""
+    conn = psycopg2.connect(
+        "host=127.0.0.1 sslmode=disable dbname=stats user={} password={}".format(
+            os.environ["PGUSER"], os.environ["PGPASSWORD"]
+        )
+    )
     return conn


-def create_tables(conn: sqlite3.Connection):
+def create_tables(conn: psycopg2.extensions.connection):
     """Create database tables if needed."""
     # TODO: add more attributes as needed
     # TODO: add all projects as columns
     print("Creating tables as needed...")
+    cur = conn.cursor()
     # mod_<project> column: files in the subfolder (=project) <project> were
     # modified by this commit.
-    conn.execute(
-        """ CREATE TABLE IF NOT EXISTS commits (
-                hash string PRIMARY KEY,
-                commit_time integer,
-                phab_id string,
-                reverts_hash string,
+    # git hashes are 40 characters long, so using char(40) data type here
+    cur.execute(
+        """ CREATE TABLE IF NOT EXISTS git_commits (
+                hash char(40) PRIMARY KEY,
+                commit_time timestamp,
+                phab_id text,
+                reverts_hash char(40),
                 mod_llvm boolean,
                 mod_clang boolean,
                 mod_libcxx boolean,
@@ -63,7 +52,16 @@ def create_tables(conn: sqlite3.Connection):
     conn.commit()


-def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.datetime):
+def get_existing_hashes(conn: psycopg2.extensions.connection) -> Set[str]:
+    """Fetch all stored git hashes from the database."""
+    print("Fetching known git hashes from the database...")
+    cur = conn.cursor()
+    cur.execute("SELECT hash from git_commits;")
+    return set((row[0] for row in cur.fetchall()))
+
+
+def update_repo(repo_dir: str) -> git.Repo:
+    """Clone or fetch local copy of the git repository."""
     if os.path.isdir(repo_dir):
         print("Fetching git repo...")
         repo = git.Repo(repo_dir)
@@ -73,25 +71,40 @@ def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.dat
     git.Repo.clone_from(GIT_URL, repo_dir, bare=True)
     repo = git.Repo(repo_dir)
     print("repo update done.")
-    sql_insert_commit = """ INSERT INTO
-          commits (hash, commit_time, phab_id, reverts_hash)
-          values (?,?,?,?);
-    """
-    sql_update_commit_project = """ UPDATE commits SET mod_{} = ? where hash = ?;"""
+    return repo
+
+
+def parse_commits(
+    conn: psycopg2.extensions.connection, repo: git.Repo, max_age: datetime.datetime
+):
+    """Parse the git repo history and upload it to the database."""
+
+    sql_insert_commit = """ INSERT INTO
+          git_commits (hash, commit_time, phab_id, reverts_hash)
+          values (%s,%s,%s,%s);
+    """
+    sql_update_commit_project = (
+        """ UPDATE git_commits SET mod_{} = %s where hash = %s;"""
+    )
+    known_hashes = get_existing_hashes(conn)
     day = None
+    cur = conn.cursor()
     for commit in repo.iter_commits(GIT_BRANCH):
         # TODO: This takes a couple of minutes, maybe try using multithreading
+
+        # Only store new/unknown hashes
+        if commit.hexsha in known_hashes:
+            continue
         if commit.committed_datetime < max_age:
             break
         mycommit = MyCommit(commit)
         if mycommit.date.day != day:
-            day = mycommit.date.day
-            print(mycommit.date)
             # take a snapshot commit, nice to see progress while updating the
             # database
+            day = mycommit.date.day
+            print(mycommit.date)
             conn.commit()
-        conn.execute(
+        cur.execute(
             sql_insert_commit,
             (
                 mycommit.chash,
@@ -104,8 +117,9 @@ def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.dat
         for project in mycommit.modified_projects:
             # TODO find a way to make this generic for all projects, maybe use
             # "ALTER TABLE" to add columns as they appear
+            # TODO: modifying the committed row is expensive, maybe find something faster
            if project in ["llvm", "libcxx", "mlir", "clang"]:
-                conn.execute(
+                cur.execute(
                     sql_update_commit_project.format(project), (True, mycommit.chash)
                 )
     conn.commit()
@@ -122,7 +136,7 @@ def create_csv_report(title: str, query: str, output_path: str):
         writer.writerow(row)


-def run_queries(conn: sqlite3.Connection, output_path: str):
+def run_queries(conn: psycopg2.extensions.connection, output_path: str):
     print("running queries...")
     create_csv_report("full_db_dump", "select * from commits;", output_path)
@@ -145,6 +159,16 @@ def run_queries(conn: sqlite3.Connection, output_path: str):
     create_csv_report("all_projects_stats", query, output_path)


+def update_comits():
+    """Update the git commits in the database from the git repository."""
+    repo = update_repo(REPO_DIR)
+    conn = connect_to_db()
+    create_tables(conn)
+    parse_commits(conn, repo, MAX_AGE)
+
+
 if __name__ == "__main__":
-    conn = popolate_db(DB_PATH, REPO_DIR, MAX_AGE)
-    run_queries(conn, OUTPUT_PATH)
+    update_comits()
+    # TODO: add argparse to switch between import and query mode or
+    # move queries to another file
+    # run_queries(conn)
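If you want to act on the "ALTER TABLE" TODO above and extend the data model per project, a sketch along these lines could work. This is a hypothetical helper, not part of this commit, and it assumes PostgreSQL 9.6+ for ADD COLUMN IF NOT EXISTS:

def ensure_project_column(conn, project: str):
    """Hypothetical helper: add a mod_<project> column on demand."""
    cur = conn.cursor()
    # Column names cannot be bound as %s query parameters, so the name is
    # formatted into the statement; project comes from the repository
    # layout, not from user input, so this is acceptable here.
    cur.execute(
        "ALTER TABLE git_commits ADD COLUMN IF NOT EXISTS mod_{} boolean;".format(
            project
        )
    )
    conn.commit()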