
repo_hist_db now using postgres DB

Instead of a local database, this script now imports the data
into the shared Postgres database. This way the data can be used
for other queries as well.
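For example, another consumer of the shared database could aggregate commit activity per project. A minimal sketch, assuming the stats database, the git_commits table, and the PGUSER/PGPASSWORD environment variables that this patch uses (the monthly-count query itself is only an illustration):

    import os
    import psycopg2

    # Same connection string as the patch below (assumption: local server,
    # "stats" database, credentials in the environment).
    conn = psycopg2.connect(
        "host=127.0.0.1 sslmode=disable dbname=stats user={} password={}".format(
            os.environ["PGUSER"], os.environ["PGPASSWORD"]
        )
    )
    cur = conn.cursor()
    # Illustrative query: commits per month that touched clang.
    cur.execute(
        "SELECT date_trunc('month', commit_time) AS month, count(*) "
        "FROM git_commits WHERE mod_clang GROUP BY month ORDER BY month;"
    )
    for month, count in cur.fetchall():
        print(month, count)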

Feel free to extend the data model if you need additional columns.
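One hypothetical way to add such a column, sketched against this patch (mod_lldb is an invented name, not part of this commit; connect_to_db is the helper introduced below):

    # Sketch: extend git_commits with one more project column.
    # mod_lldb is a hypothetical column used only for illustration.
    conn = connect_to_db()
    cur = conn.cursor()
    cur.execute("ALTER TABLE git_commits ADD COLUMN IF NOT EXISTS mod_lldb boolean;")
    conn.commit()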
Christian Kühnel 2021-05-10 16:50:51 +02:00 committed by Mikhail Goncharov
parent 668e50298c
commit 134ca4b801


@@ -1,12 +1,13 @@
#!/usr/bin/env python3
import os
import sqlite3
import psycopg2
import git
from repo_hist import MyCommit
import datetime
import csv
from typing import Set
DB_PATH = "tmp/git_hist.sqlite"
# TODO: make this path configurable for use on the server
REPO_DIR = "tmp/llvm-project"
GIT_URL = "https://github.com/llvm/llvm-project.git"
GIT_BRANCH = "main"
@@ -14,45 +15,33 @@ OUTPUT_PATH = "tmp"
# this was the start of using git as primary repo
MAX_AGE = datetime.datetime(year=2019, month=10, day=1, tzinfo=datetime.timezone.utc)
# Maximum age of the database before we re-create it
DB_UPDATE_INTERVAL = datetime.timedelta(days=1)
def popolate_db(
db_path: str, repo_dir: str, max_age: datetime.datetime
) -> sqlite3.Connection:
# TODO: full scan of the git history is quite slow. Maybe enable incremental
# updates. Only insert commits that are not yet in the database.
if os.path.exists(db_path):
age = datetime.datetime.now() - datetime.datetime.fromtimestamp(
os.path.getmtime(db_path)
def connect_to_db() -> psycopg2.extensions.connection:
"""Connect to the database, return connection object."""
conn = psycopg2.connect(
"host=127.0.0.1 sslmode=disable dbname=stats user={} password={}".format(
os.environ["PGUSER"], os.environ["PGPASSWORD"]
)
if age < DB_UPDATE_INTERVAL:
print("Database is recent enough, using existing one.")
return sqlite3.connect(db_path)
os.remove(db_path)
print("Database is stale, needs updating...")
conn = sqlite3.connect(db_path)
print("Creating tables...")
create_tables(conn)
print("Scanning repository...")
parse_commits(conn, repo_dir, max_age)
print("Done populating database.")
)
return conn
def create_tables(conn: sqlite3.Connection):
def create_tables(conn: psycopg2.extensions.connection):
"""Create database tables if needed."""
# TODO: add more attributes as needed
# TODO: add all projects as columns
print("Creating tables as needed...")
cur = conn.cursor()
# mod_<project> column: files in the subfolder (=project) <project> were
# modified by this commit.
conn.execute(
""" CREATE TABLE IF NOT EXISTS commits (
hash string PRIMARY KEY,
commit_time integer,
phab_id string,
reverts_hash string,
# git hashes are 40 characters long, so using char(40) data type here
cur.execute(
""" CREATE TABLE IF NOT EXISTS git_commits (
hash char(40) PRIMARY KEY,
commit_time timestamp,
phab_id text,
reverts_hash char(40),
mod_llvm boolean,
mod_clang boolean,
mod_libcxx boolean,
@@ -63,7 +52,16 @@ def create_tables(conn: sqlite3.Connection):
conn.commit()
def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.datetime):
def get_existing_hashes(conn: psycopg2.extensions.connection) -> Set[str]:
"""Fetch all stored git hashes from the database."""
print("Fetching known git hashes from the database...")
cur = conn.cursor()
cur.execute("SELECT hash from git_commits;")
return set((row[0] for row in cur.fetchall()))
def update_repo(repo_dir: str) -> git.Repo:
"""Clone or fetch local copy of the git repository."""
if os.path.isdir(repo_dir):
print("Fetching git repo...")
repo = git.Repo(repo_dir)
@@ -73,25 +71,40 @@ def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.dat
git.Repo.clone_from(GIT_URL, repo_dir, bare=True)
repo = git.Repo(repo_dir)
print("repo update done.")
sql_insert_commit = """ INSERT INTO
commits (hash, commit_time, phab_id, reverts_hash)
values (?,?,?,?);
"""
sql_update_commit_project = """ UPDATE commits SET mod_{} = ? where hash = ?;"""
return repo
def parse_commits(
conn: psycopg2.extensions.connection, repo: git.Repo, max_age: datetime.datetime
):
"""Parse the git repo history and upload it to the database."""
sql_insert_commit = """ INSERT INTO
git_commits (hash, commit_time, phab_id, reverts_hash)
values (%s,%s,%s,%s);
"""
sql_update_commit_project = (
""" UPDATE git_commits SET mod_{} = %s where hash = %s;"""
)
known_hashes = get_existing_hashes(conn)
day = None
cur = conn.cursor()
for commit in repo.iter_commits(GIT_BRANCH):
# TODO: This takes a couple of minutes, maybe try using multithreading
# Only store new/unknown hashes
if commit.hexsha in known_hashes:
continue
if commit.committed_datetime < max_age:
break
mycommit = MyCommit(commit)
if mycommit.date.day != day:
day = mycommit.date.day
print(mycommit.date)
# take a snapshot commit, nice to see progress while updating the
# database
day = mycommit.date.day
print(mycommit.date)
conn.commit()
conn.execute(
cur.execute(
sql_insert_commit,
(
mycommit.chash,
@@ -104,8 +117,9 @@ def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.dat
for project in mycommit.modified_projects:
# TODO find a way to make this generic for all projects, maybe user
# "ALTER TABLE" to add columns as they appear
# TODO: modifying the commited row is expensive, maybe find something faster
if project in ["llvm", "libcxx", "mlir", "clang"]:
conn.execute(
cur.execute(
sql_update_commit_project.format(project), (True, mycommit.chash)
)
conn.commit()
@@ -122,7 +136,7 @@ def create_csv_report(title: str, query: str, output_path: str):
writer.writerow(row)
def run_queries(conn: sqlite3.Connection, output_path: str):
def run_queries(conn: psycopg2.extensions.connection, output_path: str):
print("running queries...")
create_csv_report("full_db_dump", "select * from commits;", output_path)
@@ -145,6 +159,16 @@ def run_queries(conn: sqlite3.Connection, output_path: str):
create_csv_report("all_projects_stats", query, output_path)
def update_comits():
"""Update the git commits in the database from the git repository."""
repo = update_repo(REPO_DIR)
conn = connect_to_db()
create_tables(conn)
parse_commits(conn, repo, MAX_AGE)
if __name__ == "__main__":
conn = popolate_db(DB_PATH, REPO_DIR, MAX_AGE)
run_queries(conn, OUTPUT_PATH)
update_comits()
# TODO: add argparse to switch between import and query mode or
# move queries to another file
# run_queries(conn)
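One possible shape for the argparse switch that the TODO above suggests, sketched against the functions in this patch (not part of the commit):

    import argparse

    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description="git history database tool")
        parser.add_argument("mode", choices=["import", "query"])
        args = parser.parse_args()
        if args.mode == "import":
            update_comits()
        else:
            run_queries(connect_to_db(), OUTPUT_PATH)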