#!/usr/bin/env python3
# Copyright 2019 Google LLC
# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     https://llvm.org/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.

# Get data on Revisions and builds from Phabricator

import phabricator
import json
import os
import datetime
from typing import Dict, List, Optional
import csv
import time
import socket
import git
import argparse

# PHIDs of build plans used for pre-merge testing
# FIXME: how do you get these?
_PRE_MERGE_PHIDs = [
    'PHID-HMCP-bfkbtacsszhg3feydpo6',  # beta testers
    'PHID-HMCP-qbviyekvgirhhhvkzpsn',  # public pre-merge tests
]

# query all data after this date
START_DATE = datetime.date(year=2019, month=10, day=1)

class PhabResponse:
    """Base wrapper around one result dictionary of a Phabricator API response.

    Subclasses expose the entries they need as read-only properties; the rest
    of this file reads them as plain attributes (e.g. ``obj.phid``).
    """

    def __init__(self, revision_dict: Dict):
        # raw dictionary exactly as it was parsed from the JSON response
        self.revision_dict = revision_dict

    @property
    def id(self) -> str:
        """Value of the 'id' entry of the wrapped dictionary."""
        return self.revision_dict['id']

    @property
    def phid(self) -> str:
        """Value of the 'phid' entry of the wrapped dictionary."""
        return self.revision_dict['phid']

    def __str__(self):
        return str(self.revision_dict)


class Revision(PhabResponse):
    """A revision plus the buildables and diffs that were linked to it."""

    def __init__(self, revision_dict):
        super().__init__(revision_dict)
        self.buildables = []  # type: List['Buildable']
        self.diffs = []  # type: List['Diff']

    @property
    def status(self) -> str:
        return self.revision_dict['fields']['status']['value']

    @property
    def builds(self) -> List['Build']:
        """All builds of all buildables linked to this revision, in link order."""
        return [build for buildable in self.buildables for build in buildable.builds]

    @property
    def created_date(self) -> int:
        """Creation time as stored in the response (passed to ``fromtimestamp`` below)."""
        return self.revision_dict['fields']['dateCreated']

    @property
    def was_premerge_tested(self) -> bool:
        """Return true iff at least one build was a pre-merge test build."""
        return any(b.was_premerge_tested for b in self.builds)

    @property
    def repository_phid(self) -> str:
        return self.revision_dict['fields']['repositoryPHID']

    @property
    def diff_phid(self) -> str:
        return self.revision_dict['fields']['diffPHID']

    @property
    def all_diffs_have_refs(self) -> bool:
        """Return true iff every linked diff has refs (also true without any diffs)."""
        return all(d.has_refs for d in self.diffs)

    @property
    def day(self) -> datetime.date:
        """Creation day in the local timezone."""
        return datetime.date.fromtimestamp(self.created_date)

    @property
    def week(self) -> str:
        """Creation week formatted as '<ISO year>-w<ISO week>'."""
        # Use the ISO year together with the ISO week: the calendar year is
        # wrong for days around New Year (2019-12-30 belongs to 2020-w01).
        iso_year, iso_week, _ = self.day.isocalendar()
        return '{}-w{:02d}'.format(iso_year, iso_week)

    @property
    def published(self) -> bool:
        return self.status == 'published'

    @property
    def build_status_list(self) -> List[bool]:
        """Pass/fail flags of the pre-merge test builds, in build order."""
        return [b.passed for b in self.builds if b.was_premerge_tested]

    @property
    def builds_finally_succeeded(self) -> bool:
        """Return true iff one of the builds failed and the last build passed."""
        # has_failed_builds guarantees a non-empty list, so [-1] is safe here
        return self.has_failed_builds and self.build_status_list[-1]

    @property
    def has_failed_builds(self) -> bool:
        """Return true iff one of the builds failed."""
        return False in self.build_status_list

    @property
    def published_failing(self) -> bool:
        """Return true iff published and the last build failed."""
        # was_premerge_tested guarantees a non-empty list, so [-1] is safe here
        return self.was_premerge_tested and self.published \
               and not self.build_status_list[-1]

    @property
    def all_builds_passed(self) -> bool:
        return self.was_premerge_tested and all(self.build_status_list)

    @property
    def all_builds_failed(self) -> bool:
        return self.was_premerge_tested and not any(self.build_status_list)

    @property
    def dateModified(self) -> datetime.datetime:
        return datetime.datetime.fromtimestamp(self.revision_dict['fields']['dateModified'])


class Buildable(PhabResponse):
    """A buildable; links the builds that ran for it to its revision."""

    def __init__(self, revision_dict):
        super().__init__(revision_dict)
        self.builds = []  # type: List[Build]
        self.revision = None  # type: Optional[Revision]

    @property
    def diff_phid(self) -> str:
        return self.revision_dict['buildablePHID']

    @property
    def revison_phid(self) -> str:
        # NOTE: the misspelled name is kept, other code in this file reads it.
        return self.revision_dict['containerPHID']


class Build(PhabResponse):
    """A single build of a buildable."""

    def __init__(self, revision_dict):
        super().__init__(revision_dict)
        self.buildable = None  # type: Optional[Buildable]

    @property
    def buildable_phid(self) -> str:
        return self.revision_dict['fields']['buildablePHID']

    @property
    def buildplan_phid(self) -> str:
        return self.revision_dict['fields']['buildPlanPHID']

    @property
    def was_premerge_tested(self) -> bool:
        """Return true iff the build used one of the pre-merge build plans."""
        return self.buildplan_phid in _PRE_MERGE_PHIDs

    @property
    def passed(self) -> bool:
        """Returns true, if the build was a pre-merge build and "passed"."""
        return self.was_premerge_tested and self.revision_dict['fields']['buildStatus']['value'] == 'passed'

    @property
    def dateModified(self) -> datetime.datetime:
        return datetime.datetime.fromtimestamp(self.revision_dict['fields']['dateModified'])


class Diff(PhabResponse):
    """A diff of a revision, including the git refs it was created from."""

    def __init__(self, revision_dict):
        super().__init__(revision_dict)
        self.revision = None  # type: Optional[Revision]

    @property
    def revison_phid(self) -> str:
        # NOTE: the misspelled name is kept, other code in this file reads it.
        return self.revision_dict['fields']['revisionPHID']

    @property
    def _refs(self) -> List:
        return self.revision_dict['fields']['refs']

    @property
    def has_refs(self) -> bool:
        return len(self._refs) > 0

    @property
    def base_revision(self) -> Optional[str]:
        """Identifier of the first ref of type 'base', or None if there is none."""
        for ref in self._refs:
            if ref['type'] == 'base':
                return ref['identifier']
        return None

    @property
    def base_branch(self) -> Optional[str]:
        """Name of the first ref of type 'branch', or None if there is none."""
        for ref in self._refs:
            if ref['type'] == 'branch':
                return ref['name']
        return None

    @property
    def dateCreated(self) -> datetime.datetime:
        return datetime.datetime.fromtimestamp(self.revision_dict['fields']['dateCreated'])

class PhabBuildPuller:
    """Download revision, buildable, build and diff data and report on it.

    API results are cached as JSON files in ``_TMP_DIR``; ``run`` only queries
    the server for cache files that do not exist yet. The parsed objects are
    linked to each other, per-day and per-week metrics are written as CSV files
    and summary statistics are printed.
    """
    # files/folder for storing temporary results
    _TMP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'tmp'))
    _REVISION_FILE = os.path.join(_TMP_DIR, 'phab-revisions.json')
    _BUILDABLE_FILE = os.path.join(_TMP_DIR, 'phab-buildables.json')
    _BUILD_FILE = os.path.join(_TMP_DIR, 'phab-build.json')
    _DIFF_FILE = os.path.join(_TMP_DIR, 'phab-diffs.json')
    _PHAB_WEEKLY_METRICS_FILE = os.path.join(_TMP_DIR, 'phabricator_{}.csv')

    # a request is retried this many times after a socket timeout ...
    _MAX_TIMEOUT_RETRIES = 5
    # ... waiting this long before each retry
    _RETRY_DELAY_SECONDS = 10

    def __init__(self, repo_path: str):
        """Create a puller; repo_path is the local git repository used by run()."""
        self.conduit_token = None
        self.host = None
        self.phab = self._create_phab()
        self._repo_path = repo_path  # type: str
        self.revisions = {}  # type: Dict[str, Revision]
        self.buildables = {}  # type: Dict[str, Buildable]
        self.builds = {}  # type: Dict[str, Build]
        self.diffs = {}  # type: Dict[str, Diff]

    def _create_phab(self) -> 'phabricator.Phabricator':
        """Create the API client from the currently configured token and host."""
        phab = phabricator.Phabricator(token=self.conduit_token, host=self.host)
        return phab

    def _load_arcrc(self):
        """Load arc configuration from file if not set."""
        if self.conduit_token is not None or self.host is not None:
            return
        print('Loading configuration from ~/.arcrc file')
        with open(os.path.expanduser('~/.arcrc'), 'r') as arcrc_file:
            arcrc = json.load(arcrc_file)
        # use the first host configured in the file
        self.host = next(iter(arcrc['hosts']))
        self.conduit_token = arcrc['hosts'][self.host]['token']

    def run(self):
        """Download whatever is not cached yet, parse and link it, then report."""
        os.makedirs(self._TMP_DIR, exist_ok=True)
        # Order matters: each download queries by the PHIDs parsed in an
        # earlier step (buildables by revision, builds by buildable, ...).
        if not os.path.isfile(self._REVISION_FILE):
            self.get_revisions()
        self.parse_revisions()
        if not os.path.isfile(self._BUILDABLE_FILE):
            self.get_buildables()
        self.parse_buildables()
        if not os.path.isfile(self._BUILD_FILE):
            self.get_builds()
        self.parse_builds()
        if not os.path.isfile(self._DIFF_FILE):
            self.get_diffs()
        self.parse_diffs()
        self.link_objects()
        self.compute_metrics('day', lambda r: r.day)
        self.compute_metrics('week', lambda r: r.week)
        self.count_base_revisions()
        self.revision_statistics()
        self.match_base_revisions_with_repo(self._repo_path)

    def _call_with_retry(self, method, **kwargs):
        """Call method(**kwargs); retry on socket timeouts, re-raise after too many."""
        fail_count = 0
        while True:
            try:
                return method(**kwargs)
            except socket.timeout:
                fail_count += 1
                if fail_count > self._MAX_TIMEOUT_RETRIES:
                    raise
                time.sleep(self._RETRY_DELAY_SECONDS)

    def _download_all(self, method, label: str, **kwargs) -> List:
        """Page through an API method and return the entries of all pages.

        method is called with kwargs plus the paging cursor ``after`` until the
        response reports no further page; label is only used for progress output.
        """
        data = []
        after = None
        while True:
            response = self._call_with_retry(method, after=after, **kwargs).response
            data.extend(response['data'])
            print('{} {}...'.format(len(data), label))
            after = response['cursor']['after']
            if after is None:
                return data

    @staticmethod
    def _write_json(path: str, data) -> None:
        """Write data as JSON to path, replacing any existing file."""
        with open(path, 'w') as json_file:
            json.dump(data, json_file)

    @staticmethod
    def _parse_file(path: str, cls) -> Dict:
        """Load a JSON list from path and return {phid: cls(entry)} for its entries."""
        with open(path) as json_file:
            entries = json.load(json_file)
        return {obj.phid: obj for obj in (cls(entry) for entry in entries)}

    def get_revisions(self):
        """Download all revisions created since START_DATE into the cache file."""
        print('Downloading revisions starting...')
        # portable replacement for the glibc-only START_DATE.strftime('%s')
        from_date = int(time.mktime(START_DATE.timetuple()))
        constraints = {
            'createdStart': from_date
        }
        data = self._download_all(
            self.phab.differential.revision.search, 'revisions', constraints=constraints)
        print('Number of revisions:', len(data))
        self._write_json(self._REVISION_FILE, data)

    def get_buildables(self):
        """Download the buildables of all parsed revisions into the cache file."""
        print('Downloading buildables...')
        data = self._download_all(
            self.phab.harbormaster.querybuildables, 'buildables',
            containerPHIDs=[r.phid for r in self.revisions.values()])
        print('Number of buildables:', len(data))
        self._write_json(self._BUILDABLE_FILE, data)

    def get_builds(self):
        """Download the builds of all parsed buildables into the cache file."""
        print('Downloading builds...')
        constraints = {
            'buildables': [r.phid for r in self.buildables.values()]
        }
        data = self._download_all(
            self.phab.harbormaster.build.search, 'builds', constraints=constraints)
        print('Number of builds:', len(data))
        self._write_json(self._BUILD_FILE, data)

    def get_diffs(self):
        """Download the diffs of all parsed revisions into the cache file."""
        print('Downloading diffs...')
        constraints = {
            'revisionPHIDs': [r.phid for r in self.revisions.values()]
        }
        data = self._download_all(
            self.phab.differential.diff.search, 'diffs', constraints=constraints)
        print('Number of diffs:', len(data))
        self._write_json(self._DIFF_FILE, data)

    def parse_revisions(self):
        """Load the cached revisions into self.revisions, keyed by PHID."""
        self.revisions = self._parse_file(self._REVISION_FILE, Revision)
        print('Parsed {} revisions.'.format(len(self.revisions)))

    def parse_buildables(self):
        """Load the cached buildables into self.buildables, keyed by PHID."""
        self.buildables = self._parse_file(self._BUILDABLE_FILE, Buildable)
        print('Parsed {} buildables.'.format(len(self.buildables)))

    def parse_builds(self):
        """Load the cached builds into self.builds, keyed by PHID."""
        self.builds = self._parse_file(self._BUILD_FILE, Build)
        print('Parsed {} builds.'.format(len(self.builds)))

    def parse_diffs(self):
        """Load the cached diffs into self.diffs, keyed by PHID."""
        self.diffs = self._parse_file(self._DIFF_FILE, Diff)
        print('Parsed {} diffs.'.format(len(self.diffs)))

    def link_objects(self):
        """Connect builds, buildables, diffs and revisions in both directions.

        Must be called once after parsing; the metrics read the lists that are
        filled here (e.g. the builds of a revision via its buildables).
        """
        for build in self.builds.values():
            buildable = self.buildables[build.buildable_phid]
            build.buildable = buildable
            buildable.builds.append(build)

        for buildable in self.buildables.values():
            revision = self.revisions[buildable.revison_phid]
            buildable.revision = revision
            revision.buildables.append(buildable)

        for diff in self.diffs.values():
            revision = self.revisions[diff.revison_phid]
            diff.revision = revision
            revision.diffs.append(diff)

    @staticmethod
    def _metrics_row(name: str, group, revisions: List) -> Dict:
        """Compute one CSV row for the (non-empty) list of revisions of a group."""
        num_revisions = len(revisions)

        def count(predicate) -> int:
            return sum(1 for r in revisions if predicate(r))

        def percent(number: int) -> float:
            return 100.0 * number / num_revisions

        num_premt_revisions = count(lambda r: r.was_premerge_tested)
        num_no_build_triggered = count(lambda r: len(r.builds) == 0)
        num_had_failed_builds = count(lambda r: r.has_failed_builds)
        num_failed_first_then_passed = count(lambda r: r.builds_finally_succeeded)
        num_published_failing = count(lambda r: r.published_failing)
        num_all_passed = count(lambda r: r.all_builds_passed)
        return {
            name: group,
            '# revisions': num_revisions,
            '# tested revisions': num_premt_revisions,
            '% tested revisions': percent(num_premt_revisions),
            '# untested revisions': num_revisions - num_premt_revisions,
            '# revisions without builds': num_no_build_triggered,
            '% revisions without builds': percent(num_no_build_triggered),
            '# no repository set': count(lambda r: r.repository_phid is None),
            '# had failed builds': num_had_failed_builds,
            '% had failed builds': percent(num_had_failed_builds),
            '# failed first then passed': num_failed_first_then_passed,
            '% failed first then passed': percent(num_failed_first_then_passed),
            '# published failing': num_published_failing,
            '% published failing': percent(num_published_failing),
            '# all passed': num_all_passed,
            '% all passed': percent(num_all_passed),
        }

    def compute_metrics(self, name: str, group_function):
        """Write a CSV file with one row of metrics per group of revisions.

        group_function maps a revision to its (sortable) group key; name is used
        both for the output file name and as header of the group column.
        """
        print('Creating metrics for {}...'.format(name))
        group_dict = {}
        for revision in self.revisions.values():
            group_dict.setdefault(group_function(revision), []).append(revision)

        fieldnames = [name, '# revisions', '# tested revisions', '% tested revisions', '# untested revisions',
                      '# revisions without builds', '% revisions without builds', '# no repository set',
                      '# had failed builds', '% had failed builds', '# failed first then passed',
                      '% failed first then passed', '# published failing', '% published failing',
                      '# all passed', '% all passed']

        # newline='' leaves the line endings to the csv module
        with open(self._PHAB_WEEKLY_METRICS_FILE.format(name), 'w', newline='') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=csv.excel)
            writer.writeheader()
            for group in sorted(group_dict.keys()):
                revisions = group_dict[group]  # type: List[Revision]
                writer.writerow(self._metrics_row(name, group, revisions))

    def count_base_revisions(self):
        """Print how often each git base revision and base branch is used by the diffs."""
        base_revisions = {}
        base_branches = {}
        for diff in self.diffs.values():
            base_revision = diff.base_revision
            base_revisions[base_revision] = base_revisions.get(base_revision, 0) + 1

            base_branch = diff.base_branch
            base_branches[base_branch] = base_branches.get(base_branch, 0) + 1
        print(f'{len(self.diffs)} diffs are using {len(base_revisions)} different git base revisions.')
        print('The top 10 revisions and their usages are:')
        revisions = sorted(base_revisions.items(), key=lambda x: x[1], reverse=True)
        for i in revisions[:10]:
            print(f'  commit {i[0]} was used {i[1]} times')
        print(f'{len(self.diffs)} diffs are using {len(base_branches)} different git base branches')
        branches = sorted(base_branches.items(), key=lambda x: x[1], reverse=True)
        print('The top 10 branches and their usages are:')
        for i in branches[:10]:
            print(f'  branch {i[0]} was used {i[1]} times')

    def match_base_revisions_with_repo(self, repo_path: str):
        """Print how many base revisions of the diffs are usable in the repo at repo_path.

        A base revision counts as unusable if the commit is not found in the
        repository or was committed after the diff was created.
        """
        repo = git.Repo(repo_path)
        not_found = 0
        invalid_date = 0
        has_base_revision = 0
        for diff in self.diffs.values():
            revision = diff.base_revision
            if revision is None:
                continue
            has_base_revision += 1
            try:
                commit = repo.commit(revision)
            except (ValueError, git.BadName):
                not_found += 1
                continue
            commited_date = datetime.datetime.fromtimestamp(commit.committed_date)
            if commited_date > diff.dateCreated:
                invalid_date += 1
        print(f'Of the {has_base_revision} Diffs with base revision, the base revision was NOT found in the repo for {not_found} and ')
        print(f'{invalid_date} base revisions were used before being available upstream.')
        # the percentage is undefined if no diff specified a base revision
        if has_base_revision > 0:
            print(f'So {(not_found+invalid_date)/has_base_revision*100:0.2f} % of specified the base revisions were unusable.')

    def revision_statistics(self):
        """Print how many revisions had no, failed, recovered or only passing builds."""
        no_builds = 0
        has_failed = 0
        fail_then_pass = 0
        all_passed = 0
        fail_last = 0
        for revision in self.revisions.values():
            build_status = [b.passed for b in revision.builds]
            if len(build_status) == 0:
                no_builds += 1
                # nothing else to classify, and build_status[-1] would fail
                continue
            if False in build_status:
                has_failed += 1
                if build_status[-1]:
                    fail_then_pass += 1
            else:
                all_passed += 1
            if revision.published and not build_status[-1]:
                fail_last += 1
        print(f'Out of the {len(self.revisions)} Revisions:')
        print(f'   {no_builds} had no builds.')
        print(f'   {has_failed} had failed builds.')
        print(f'   {fail_then_pass} had failed builds, but the last build passed.')
        print(f'   {all_passed} had only successful builds.')
        print(f'   {fail_last} were published with a failing build.')

if __name__ == '__main__':
    # Command line entry point: pull the data and create all reports.
    parser = argparse.ArgumentParser(
        description='Get data on Revisions and builds from Phabricator.')
    # args.repo_path is read below, so the argument has to be declared
    parser.add_argument('repo_path', help='path to the local git repository')
    args = parser.parse_args()
    puller = PhabBuildPuller(args.repo_path)
    puller.run()