137 lines
4.5 KiB
Python
Executable file
137 lines
4.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
import os
|
|
import sqlite3
|
|
import git
|
|
from repo_hist import MyCommit
|
|
import datetime
|
|
|
|
# Location of the SQLite file that caches the parsed commit history.
DB_PATH = "tmp/git_hist.sqlite"
# Local directory holding the clone of the repository.
REPO_DIR = "tmp/llvm-project"
# Remote the repository is cloned from when REPO_DIR does not exist yet.
GIT_URL = "https://github.com/llvm/llvm-project.git"
# Branch whose history is scanned.
GIT_BRANCH = "main"
# Cut-off date passed to the scan as ``max_age``: October 2019 was the start
# of using git as the primary repo.
MAX_AGE = datetime.datetime(2019, 10, 1, tzinfo=datetime.timezone.utc)
# Maximum age of the database file before it is re-created.
DB_UPDATE_INTERVAL = datetime.timedelta(days=1)
|
|
|
|
|
|
def popolate_db(
    db_path: str, repo_dir: str, max_age: datetime.datetime
) -> sqlite3.Connection:
    """Return a connection to the commit database, rebuilding it when stale.

    If ``db_path`` exists and was modified less than ``DB_UPDATE_INTERVAL``
    ago, the existing file is opened and returned unchanged. Otherwise any
    old file is deleted and a new database is created, indexed and filled
    by scanning the repository in ``repo_dir``.

    Args:
        db_path: Path of the SQLite database file.
        repo_dir: Directory of the local git clone to scan.
        max_age: Cut-off date forwarded to ``parse_commits``.

    Returns:
        An open connection to the (possibly freshly built) database.

    Raises:
        Whatever the table creation or repository scan raises. In that case
        the partially written database file is removed first.
    """
    # TODO: full scan of the git history is quite slow. Maybe enable incremental
    # updates. Only insert commits that are not yet in the database.
    if os.path.exists(db_path):
        age = datetime.datetime.now() - datetime.datetime.fromtimestamp(
            os.path.getmtime(db_path)
        )
        if age < DB_UPDATE_INTERVAL:
            print("Database is recent enough, using existing one.")
            return sqlite3.connect(db_path)
        os.remove(db_path)

    print("Database is stale, needs updating...")
    # sqlite3.connect() cannot create missing directories, so make sure the
    # parent exists before opening the file.
    parent_dir = os.path.dirname(db_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    conn = sqlite3.connect(db_path)
    try:
        print("Creating tables...")
        create_tables(conn)
        print("Creating indexes...")
        create_indexes(conn)
        print("Scanning repository...")
        parse_commits(conn, repo_dir, max_age)
    except BaseException:
        # A half-filled file would have a fresh mtime and be mistaken for an
        # up-to-date database on the next run, so discard it. BaseException
        # is caught on purpose: Ctrl-C during the long scan must clean up too.
        conn.close()
        if os.path.exists(db_path):
            os.remove(db_path)
        raise
    print("Done populating database.")
    return conn
|
|
|
|
|
|
def create_tables(conn: sqlite3.Connection):
    """Create the ``commits`` and ``commit_project`` tables if they are missing.

    Text columns are declared ``TEXT``. SQLite gives a column declared as
    ``string`` NUMERIC affinity, which silently converts digit-only values
    (for example a short hash such as ``0123456``) into numbers.

    Args:
        conn: Open database connection; the schema change is committed on it.
    """
    # TODO: add more attributes as needed
    conn.execute(
        """ CREATE TABLE IF NOT EXISTS commits (
                hash TEXT PRIMARY KEY,
                commit_time integer,
                phab_id TEXT,
                reverts_hash TEXT
            ); """
    )
    # Normalized representation of modified projects per commit.
    conn.execute(
        """ CREATE TABLE IF NOT EXISTS commit_project (
                project TEXT,
                hash TEXT,
                FOREIGN KEY (hash) REFERENCES commits(hash)
            );"""
    )

    conn.commit()
|
|
|
|
|
|
def create_indexes(conn: sqlite3.Connection):
    """Create indexes to speed up searches and joins.

    Both indexes are on ``commit_project`` (one on ``hash``, one on
    ``project``). ``IF NOT EXISTS`` makes the call safe to repeat on a
    database that already has them, matching how the tables are created.

    Args:
        conn: Open database connection; the change is committed on it.
    """
    conn.execute(
        """ CREATE INDEX IF NOT EXISTS commit_project_hash
            ON commit_project(hash);"""
    )
    conn.execute(
        """ CREATE INDEX IF NOT EXISTS commit_project_project
            ON commit_project(project);"""
    )
    conn.commit()
|
|
|
|
|
|
def parse_commits(conn: sqlite3.Connection, repo_dir: str, max_age: datetime.datetime):
    """Scan the git history and store one row per commit in the database.

    The repository in ``repo_dir`` is fetched if the directory exists and
    cloned (bare) from ``GIT_URL`` otherwise. Commits on ``GIT_BRANCH`` are
    then visited in the order ``iter_commits`` yields them until the first
    one committed before ``max_age``.

    Args:
        conn: Open database connection that receives the rows.
        repo_dir: Directory of the local clone.
        max_age: Commits committed before this moment end the scan.
    """
    insert_commit_sql = """ INSERT INTO
            commits (hash, commit_time, phab_id, reverts_hash)
            values (?,?,?,?);
            """
    insert_project_sql = """ INSERT INTO
            commit_project (hash, project)
            values (?,?);
            """

    if not os.path.isdir(repo_dir):
        print("Cloning git repo...")
        git.Repo.clone_from(GIT_URL, repo_dir, bare=True)
        repo = git.Repo(repo_dir)
    else:
        print("Fetching git repo...")
        repo = git.Repo(repo_dir)
        # NOTE(review): the clone above is bare; confirm that this fetch
        # really advances the local branch that iter_commits() reads below.
        repo.remotes.origin.fetch(GIT_BRANCH)
    print("repo update done.")

    last_day = None
    for raw_commit in repo.iter_commits(GIT_BRANCH):
        # TODO: This takes a couple of minutes, maybe try using multithreading
        if raw_commit.committed_datetime < max_age:
            break
        parsed = MyCommit(raw_commit)
        if parsed.date.day != last_day:
            last_day = parsed.date.day
            print(parsed.date)
            # Snapshot commit whenever the day changes: makes progress
            # visible in the database while the scan is still running.
            conn.commit()
        commit_row = (
            parsed.chash,
            parsed.date,
            parsed.phab_revision,
            parsed.reverts_commit_hash,
        )
        conn.execute(insert_commit_sql, commit_row)
        # Note: parsing the patches is quite slow
        for project in parsed.modified_projects:
            conn.execute(insert_project_sql, (parsed.chash, project))
    conn.commit()
|
|
|
|
|
|
def run_queries(conn: sqlite3.Connection, project: str = "libcxx"):
    """Print every commit that modified ``project``.

    Each matching row is printed as a ``(hash, phab_id, commit_time)`` tuple.
    The project name is passed as a bound parameter, not as a double-quoted
    SQL token: double quotes denote identifiers in SQL, and SQLite only
    accepts them as strings through a legacy fallback that can be disabled.

    Args:
        conn: Open connection to a database with the ``commits`` and
            ``commit_project`` tables.
        project: Project name to filter on; defaults to ``"libcxx"``.
    """
    query = """SELECT commits.hash, commits.phab_id, commits.commit_time
               FROM commits
               INNER JOIN commit_project ON commits.hash = commit_project.hash
               WHERE commit_project.project = ?;"""
    cursor = conn.cursor()
    for row in cursor.execute(query, (project,)):
        print(row)
|
|
|
|
|
|
if __name__ == "__main__":
    # Build (or reuse) the commit database, then run the example query.
    conn = popolate_db(DB_PATH, REPO_DIR, MAX_AGE)
    try:
        run_queries(conn)
    finally:
        # Release the SQLite file handle even if a query fails.
        conn.close()
|