#!/usr/bin/env python3
# Copyright 2019 Google LLC
#
# Licensed under the Apache License v2.0 with LLVM Exceptions (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://llvm.org/LICENSE.txt
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get data on Revisions and builds from Phabricator

import phabricator
import json
import os
import datetime
from typing import Dict, List, Optional
import csv
import time
import socket
import git
import argparse

# Build plan PHIDs whose builds count as pre-merge testing.
# FIXME: document how these PHIDs can be looked up.
_PRE_MERGE_PHIDs = [
    'PHID-HMCP-bfkbtacsszhg3feydpo6',  # beta testers
    'PHID-HMCP-qbviyekvgirhhhvkzpsn',  # public pre-merge tests
    'PHID-HMCP-p2oc4ocen3l2yzymvg2l',
]

# Revisions created before this date are not queried.
START_DATE = datetime.date(2019, 10, 1)

class PhabResponse:
    """Base wrapper around a single result dictionary from the Phabricator API.

    Subclasses add properties that read further fields out of the same
    dictionary.
    """

    def __init__(self, revision_dict: Dict):
        # Raw dictionary of the wrapped object; every property reads from it.
        self.revision_dict = revision_dict

    @property
    def phid(self) -> str:
        """Value stored under the 'phid' key."""
        raw = self.revision_dict
        return raw['phid']

    @property
    def id(self) -> str:
        """Value stored under the 'id' key."""
        raw = self.revision_dict
        return raw['id']

    def __str__(self):
        # The string form is simply that of the wrapped dictionary.
        return f'{self.revision_dict}'


class Revision(PhabResponse):
    """A Differential revision plus the buildables and diffs linked to it."""

    def __init__(self, revision_dict):
        super().__init__(revision_dict)
        # Both lists start empty; PhabBuildPuller.link_objects() appends to them.
        self.buildables = []  # type: List['Buildable']
        self.diffs = []  # type: List['Diff']

    @property
    def status(self) -> str:
        """Status value of the revision (compared against 'published' below)."""
        return self.revision_dict['fields']['status']['value']

    @property
    def builds(self) -> List['Build']:
        """All builds of all buildables of this revision as one flat list."""
        builds = []
        for b in self.buildables:
            builds.extend(b.builds)
        return builds

    @property
    def created_date(self):
        """Raw 'dateCreated' field; `day` passes it to date.fromtimestamp()."""
        return self.revision_dict['fields']['dateCreated']

    @property
    def was_premerge_tested(self) -> bool:
        """Return true iff at least one build was a pre-merge test build."""
        return any(b.was_premerge_tested for b in self.builds)

    @property
    def repository_phid(self) -> str:
        """Raw 'repositoryPHID' field (compute_metrics checks it for None)."""
        return self.revision_dict['fields']['repositoryPHID']

    @property
    def diff_phid(self) -> str:
        """Raw 'diffPHID' field of the revision."""
        return self.revision_dict['fields']['diffPHID']

    @property
    def all_diffs_have_refs(self) -> bool:
        """Return true iff every linked diff has refs (true without diffs)."""
        return all(d.has_refs for d in self.diffs)

    @property
    def day(self) -> datetime.date:
        """Creation date, converted from the timestamp in local time."""
        return datetime.date.fromtimestamp(self.created_date)

    @property
    def week(self) -> str:
        """ISO week of creation, formatted as '<ISO year>-w<ISO week>'.

        The ISO year is used instead of the calendar year: the days around
        New Year can belong to a week of the neighbouring year (2019-12-30 is
        in week 1 of 2020), and pairing the calendar year with the ISO week
        number would file them under the wrong week.
        """
        iso_year, iso_week, _ = self.day.isocalendar()
        return '{}-w{:02d}'.format(iso_year, iso_week)

    @property
    def published(self) -> bool:
        """Return true iff the status value is 'published'."""
        return self.status == 'published'

    @property
    def build_status_list(self) -> List[bool]:
        """Pass/fail flags of the pre-merge builds, in the order of `builds`."""
        return [b.passed for b in self.builds if b.was_premerge_tested]

    @property
    def builds_finally_succeeded(self) -> bool:
        """Return true iff one of the builds failed and the last build passed."""
        # has_failed_builds implies a non-empty list, so [-1] is safe here.
        return self.has_failed_builds and self.build_status_list[-1]

    @property
    def has_failed_builds(self) -> bool:
        """Return true iff one of the builds failed."""
        return False in self.build_status_list

    @property
    def published_failing(self) -> bool:
        """Return true iff published and the last build failed."""
        # was_premerge_tested implies a non-empty list, so [-1] is safe here.
        return self.was_premerge_tested and self.published \
               and not self.build_status_list[-1]

    @property
    def all_builds_passed(self) -> bool:
        """Return true iff pre-merge tested and every pre-merge build passed."""
        return self.was_premerge_tested and all(self.build_status_list)

    @property
    def all_builds_failed(self) -> bool:
        """Return true iff pre-merge tested and no pre-merge build passed."""
        return self.was_premerge_tested and all(not b for b in self.build_status_list)

    @property
    def dateModified(self) -> datetime.datetime:
        """'dateModified' field converted to a naive local datetime."""
        return datetime.datetime.fromtimestamp(self.revision_dict['fields']['dateModified'])
       

class Buildable(PhabResponse):
    """A buildable: connects a revision to the builds that were run for it."""

    def __init__(self, revision_dict):
        super().__init__(revision_dict)
        # Back-reference and build list are populated by
        # PhabBuildPuller.link_objects(); until then they stay None / empty.
        self.revision = None  # type: Optional[Revision]
        self.builds = []  # type: List[Build]

    @property
    def revison_phid(self) -> str:
        """Value of the 'containerPHID' key; used to look up the revision."""
        raw = self.revision_dict
        return raw['containerPHID']

    @property
    def diff_phid(self) -> str:
        """Value of the 'buildablePHID' key."""
        raw = self.revision_dict
        return raw['buildablePHID']


class Build(PhabResponse):
    """A single build that belongs to one buildable."""

    def __init__(self, revision_dict):
        super().__init__(revision_dict)
        # Set by PhabBuildPuller.link_objects() once the buildables are known.
        self.buildable = None  # type: Optional[Buildable]

    @property
    def _fields(self) -> Dict:
        """The 'fields' sub-dictionary that all properties of a build read."""
        return self.revision_dict['fields']

    @property
    def buildable_phid(self) -> str:
        """PHID of the buildable this build was run for."""
        return self._fields['buildablePHID']

    @property
    def buildplan_phid(self) -> str:
        """PHID of the build plan that produced this build."""
        return self._fields['buildPlanPHID']

    @property
    def was_premerge_tested(self) -> bool:
        """Return true iff the build plan is one of the pre-merge plans."""
        return self.buildplan_phid in _PRE_MERGE_PHIDs

    @property
    def passed(self) -> bool:
        """Returns true, if this is a pre-merge build with status "passed"."""
        if not self.was_premerge_tested:
            return False
        status = self._fields['buildStatus']['value']
        return status == 'passed'

    @property
    def dateModified(self) -> datetime.datetime:
        """'dateModified' field converted to a naive local datetime."""
        timestamp = self._fields['dateModified']
        return datetime.datetime.fromtimestamp(timestamp)

class Diff(PhabResponse):
    """A single diff of a revision, including its git refs."""

    def __init__(self, revision_dict):
        super().__init__(revision_dict)
        # Set by PhabBuildPuller.link_objects() once the revisions are known.
        self.revision = None  # type: Optional[Revision]

    @property
    def revison_phid(self) -> str:
        """PHID of the revision this diff belongs to."""
        return self.revision_dict['fields']['revisionPHID']

    @property
    def _refs(self) -> List:
        """Raw list of ref dictionaries attached to the diff."""
        return self.revision_dict['fields']['refs']

    @property
    def has_refs(self) -> bool:
        """Return true iff the diff carries at least one ref."""
        return len(self._refs) > 0

    @property
    def base_revision(self) -> Optional[str]:
        """Identifier of the first ref of type 'base', or None without one."""
        return next(
            (ref['identifier'] for ref in self._refs if ref['type'] == 'base'),
            None)

    @property
    def base_branch(self) -> Optional[str]:
        """Name of the first ref of type 'branch', or None without one."""
        return next(
            (ref['name'] for ref in self._refs if ref['type'] == 'branch'),
            None)

    @property
    def dateCreated(self) -> datetime.datetime:
        """'dateCreated' field converted to a naive local datetime."""
        return datetime.datetime.fromtimestamp(self.revision_dict['fields']['dateCreated'])

class PhabBuildPuller:
    # files/folder for storing temporary results
    _TMP_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'tmp'))
    _REVISION_FILE = os.path.join(_TMP_DIR, 'phab-revisions.json')
    _BUILDABLE_FILE = os.path.join(_TMP_DIR, 'phab-buildables.json')
    _BUILD_FILE = os.path.join(_TMP_DIR, 'phab-build.json')
    _DIFF_FILE = os.path.join(_TMP_DIR, 'phab-diffs.json')
    _PHAB_WEEKLY_METRICS_FILE = os.path.join(_TMP_DIR, 'phabricator_{}.csv')

    def __init__(self, repo_path: str):
        """Create the API client and the (initially empty) object caches.

        Args:
            repo_path: path that run() later hands to
                match_base_revisions_with_repo().
        """
        self._repo_path = repo_path  # type: str
        # Both settings must exist before _create_phab() reads them.
        self.conduit_token = None
        self.host = None
        self.phab = self._create_phab()
        # Parsed objects; the parse_*() methods key them by PHID.
        self.diffs = {}  # type: Dict[str, Diff]
        self.builds = {}  # type: Dict[str, Build]
        self.buildables = {}  # type: Dict[str, Buildable]
        self.revisions = {}  # type: Dict[str, Revision]

    def _create_phab(self) -> phabricator.Phabricator:
        """Build an API client from the current token/host and update its interfaces."""
        client = phabricator.Phabricator(token=self.conduit_token, host=self.host)
        client.update_interfaces()
        return client

    def _load_arcrc(self):
        """Load arc configuration from file if not set."""
        if self.conduit_token is not None or self.host is not None:
            return
        print('Loading configuration from ~/.arcrc file')
        with open(os.path.expanduser('~/.arcrc'), 'r') as arcrc_file:
            arcrc = json.load(arcrc_file)
        # use the first host configured in the file
        self.host = next(iter(arcrc['hosts']))
        self.conduit_token = arcrc['hosts'][self.host]['token']

    def run(self):
        """Download what is not cached yet, parse the caches and report.

        Each kind of object is downloaded only when its cache file is missing
        and is then parsed from that file. Afterwards the objects are linked
        and all metrics and statistics are produced.
        """
        if not os.path.exists(self._TMP_DIR):
            os.mkdir(self._TMP_DIR)
        # Order matters: later downloads query by the PHIDs parsed earlier.
        stages = (
            (self._REVISION_FILE, self.get_revisions, self.parse_revisions),
            (self._BUILDABLE_FILE, self.get_buildables, self.parse_buildables),
            (self._BUILD_FILE, self.get_builds, self.parse_builds),
            (self._DIFF_FILE, self.get_diffs, self.parse_diffs),
        )
        for cache_file, download, parse in stages:
            if not os.path.isfile(cache_file):
                download()
            parse()
        self.link_objects()
        self.compute_metrics('day', lambda r: r.day)
        self.compute_metrics('week', lambda r: r.week)
        self.count_base_revisions()
        self.revision_statistics()
        self.match_base_revisions_with_repo(self._repo_path)

    def get_revisions(self):
        """Download all revisions created since START_DATE into the cache file.

        Results are fetched page by page, following the 'after' cursor of
        each response until it is None, and are then written as JSON to
        _REVISION_FILE.
        """
        print('Downloading revisions starting...')
        # Epoch seconds of local midnight on START_DATE. time.mktime() is
        # used because the strftime('%s') directive is a platform-specific
        # extension that is not available everywhere.
        from_date = int(time.mktime(START_DATE.timetuple()))
        constraints = {
            'createdStart': from_date
        }
        # FIXME: lots of code duplication around pagination and error handling.
        # find a way to separate this into a function.
        data = []
        after = None
        while True:
            response = self.phab.differential.revision.search(
                constraints=constraints, after=after).response
            data.extend(response['data'])
            print('{} revisions...'.format(len(data)))
            after = response['cursor']['after']
            if after is None:
                break
        print('Number of revisions:', len(data))
        with open(self._REVISION_FILE, 'w') as json_file:
            json.dump(data, json_file)

    def get_buildables(self):
        """Fetch the buildables of all parsed revisions and cache them as JSON.

        Pages through harbormaster.querybuildables until the cursor reports
        no further page, then writes the collected entries to
        _BUILDABLE_FILE.
        """
        print('Downloading buildables...')
        container_phids = [rev.phid for rev in self.revisions.values()]
        collected = []
        cursor = None
        has_more = True
        while has_more:
            page = self.phab.harbormaster.querybuildables(
                containerPHIDs=container_phids, after=cursor).response
            collected += page['data']
            print('{} buildables...'.format(len(collected)))
            cursor = page['cursor']['after']
            has_more = cursor is not None
        print('Number of buildables:', len(collected))
        with open(self._BUILDABLE_FILE, 'w') as json_file:
            json.dump(collected, json_file)

    def get_builds(self):
        """Download the builds of all parsed buildables into the cache file.

        Results are fetched page by page via the 'after' cursor. A request
        that times out is repeated after a 10 second pause; the sixth
        consecutive timeout of the same page is re-raised.

        Raises:
            socket.timeout: if one page keeps timing out.
        """
        print('Downloading builds...')
        data = []
        constraints = {
            'buildables': [r.phid for r in self.buildables.values()]
        }
        after = None
        while True:
            # retry on timeouts
            fail_count = 0
            while True:
                try:
                    builds = self.phab.harbormaster.build.search(
                        constraints=constraints, after=after)
                except socket.timeout:
                    fail_count += 1
                    if fail_count > 5:
                        raise
                    time.sleep(10)
                    continue
                break
            data.extend(builds.response['data'])
            print('{} builds...'.format(len(data)))
            after = builds.response['cursor']['after']
            if after is None:
                break
        # This summary used to say "buildables" although it counts builds.
        print('Number of builds:', len(data))
        with open(self._BUILD_FILE, 'w') as json_file:
            json.dump(data, json_file)

    def get_diffs(self):
        """Fetch the diffs of all parsed revisions and cache them as JSON.

        Pages through differential.diff.search via the 'after' cursor. Each
        page is requested up to six times: after a socket timeout the request
        is repeated following a 10 second pause, and the sixth timeout in a
        row propagates to the caller.
        """
        print('Downloading diffs...')
        constraints = {
            'revisionPHIDs': [rev.phid for rev in self.revisions.values()]
        }
        collected = []
        cursor = None
        has_more = True
        while has_more:
            # retry on timeouts
            for retries_left in range(5, -1, -1):
                try:
                    page = self.phab.differential.diff.search(
                        constraints=constraints, after=cursor)
                    break
                except socket.timeout:
                    if retries_left == 0:
                        raise
                    time.sleep(10)
            collected += page.response['data']
            print('{} diffs...'.format(len(collected)))
            cursor = page.response['cursor']['after']
            has_more = cursor is not None
        print('Number of diffs:', len(collected))
        with open(self._DIFF_FILE, 'w') as json_file:
            json.dump(collected, json_file)

    def parse_revisions(self):
        """Load the revision cache file into self.revisions, keyed by PHID."""
        with open(self._REVISION_FILE) as revision_file:
            raw_entries = json.load(revision_file)
        parsed = {}
        for entry in raw_entries:
            revision = Revision(entry)
            parsed[revision.phid] = revision
        self.revisions = parsed
        print('Parsed {} revisions.'.format(len(parsed)))

    def parse_buildables(self):
        """Load the buildable cache file into self.buildables, keyed by PHID."""
        with open(self._BUILDABLE_FILE) as buildables_file:
            raw_entries = json.load(buildables_file)
        parsed = {}
        for entry in raw_entries:
            buildable = Buildable(entry)
            parsed[buildable.phid] = buildable
        self.buildables = parsed
        print('Parsed {} buildables.'.format(len(parsed)))

    def parse_builds(self):
        """Load the build cache file into self.builds, keyed by PHID."""
        with open(self._BUILD_FILE) as build_file:
            raw_entries = json.load(build_file)
        parsed = {}
        for entry in raw_entries:
            build = Build(entry)
            parsed[build.phid] = build
        self.builds = parsed
        print('Parsed {} builds.'.format(len(parsed)))

    def parse_diffs(self):
        """Load the diff cache file into self.diffs, keyed by PHID."""
        with open(self._DIFF_FILE) as diff_file:
            raw_entries = json.load(diff_file)
        parsed = {}
        for entry in raw_entries:
            diff = Diff(entry)
            parsed[diff.phid] = diff
        self.diffs = parsed
        print('Parsed {} diffs.'.format(len(parsed)))

    def link_objects(self):
        for build in (b for b in self.builds.values()):
            buildable = self.buildables[build.buildable_phid]
            build.buildable = buildable
            buildable.builds.append(build)

        for buildable in self.buildables.values():
            revision = self.revisions[buildable.revison_phid]
            revision.buildables.append(buildable)
            buildable.revision = revision

        for diff in self.diffs.values():
            revision = self.revisions[diff.revison_phid]
            revision.diffs.append(diff)
            diff.revision = revision

    def compute_metrics(self, name: str, group_function):
        print('Creating metrics for {}...'.format(name))
        group_dict = {}
        for revision in self.revisions.values():
            group_dict.setdefault(group_function(revision), []).append(revision)

        csv_file = open(self._PHAB_WEEKLY_METRICS_FILE.format(name), 'w')
        fieldnames = [name, '# revisions', '# tested revisions', '% tested revisions', '# untested revisions',
                      '# revisions without builds', '% revisions without builds', '# no repository set', 
                      '# had failed builds', '% had failed builds', '# failed first then passed', 
                      '% failed first then passed', '# published failing', '% published failing',
                        '# all passed', '% all passed']

        writer = csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=csv.excel)
        writer.writeheader()
        for group in sorted(group_dict.keys()):
            revisions = group_dict[group]  # type: List[Revision]
            num_revisions = len(revisions)
            num_premt_revisions = len([r for r in revisions if r.was_premerge_tested])
            precentage_premt_revisions = 100.0 * num_premt_revisions / num_revisions
            num_no_build_triggered = len([r for r in revisions if len(r.builds) == 0])
            percent_no_build_triggered = 100.0 * num_no_build_triggered / num_revisions
            num_no_repo = len([r for r in revisions if r.repository_phid is None])
            num_had_failed_builds =len([r for r in revisions if r.has_failed_builds])
            num_failed_first_then_passed = len([r for r in revisions if r.builds_finally_succeeded])
            num_published_failing = len([r for r in revisions if r.published_failing])
            num_all_passed = len([r for r in revisions if r.all_builds_passed])
            writer.writerow({
                name: group,
                '# revisions': num_revisions,
                '# tested revisions': num_premt_revisions,
                '% tested revisions': precentage_premt_revisions,
                '# untested revisions': num_revisions - num_premt_revisions,
                '# revisions without builds': num_no_build_triggered,
                '% revisions without builds': percent_no_build_triggered,
                '# no repository set': num_no_repo,
                '# had failed builds': num_had_failed_builds,
                '% had failed builds': 100 * num_had_failed_builds / num_revisions,
                '# failed first then passed': num_failed_first_then_passed,
                '% failed first then passed': 100 * num_failed_first_then_passed / num_revisions,
                '# published failing': num_published_failing,
                '% published failing': 100 * num_published_failing / num_revisions,
                '# all passed': num_all_passed,
                '% all passed': 100*num_all_passed / num_revisions,
            })

    def count_base_revisions(self):
        base_revisions = {}
        base_branches = {}
        for diff in self.diffs.values():
            base_revisions.setdefault(diff.base_revision, 0)
            base_revisions[diff.base_revision] += 1

            base_branches.setdefault(diff.base_branch, 0) 
            base_branches[diff.base_branch] +=1
        
        print(f'{len(self.diffs)} diffs are using {len(base_revisions)} different git base revisions.')
        print('The top 10 revisions and their usages are:')
        revisions = sorted( base_revisions.items(), key=lambda x: x[1] , reverse=True)
        for i in revisions[:10]:
            print(f'  commit {i[0]} was used {i[1]} times')
        print()
        print(f'{len(self.diffs)} diffs are using {len(base_branches)} different git base branches')
        branches = sorted( base_branches.items(), key=lambda x: x[1] , reverse=True)
        print('The top 10 branches and their usages are:')
        for i in branches[:10]:
            print(f'  branch {i[0]} was used {i[1]} times')
        print()

    def match_base_revisions_with_repo(self, repo_path: str):
        """Check the base revisions of all diffs against a local git repository.

        A base revision counts as unusable when looking it up in the
        repository raises ValueError or git.BadName, or when the commit's
        committed date is later than the creation date of the diff. Diffs
        without a base revision are skipped. The totals are printed.

        Args:
            repo_path: path of the git repository to look the commits up in.
        """
        repo = git.Repo(repo_path)
        not_found = 0
        invalid_date = 0
        has_base_revision = 0
        for diff in self.diffs.values():
            revision = diff.base_revision
            if revision is None:
                continue
            has_base_revision += 1
            try:
                commit = repo.commit(revision)
            except (ValueError, git.BadName):
                not_found += 1
                continue
            # Both sides are naive local datetimes built with fromtimestamp().
            commited_date = datetime.datetime.fromtimestamp(commit.committed_date)
            if commited_date > diff.dateCreated:
                invalid_date += 1

        # Without any base revision the ratio is undefined; report 0 % rather
        # than failing with a ZeroDivisionError.
        if has_base_revision:
            unusable_percent = (not_found + invalid_date) / has_base_revision * 100
        else:
            unusable_percent = 0.0

        print()
        print(f'Of the {has_base_revision} Diffs with base revision, the base revision was NOT found in the repo for {not_found} and ')
        print(f'{invalid_date} base revisions were used before being available upstream.')
        print(f'So {unusable_percent:0.2f} % of specified the base revisions were unusable.')

    def revision_statistics(self):
        no_builds = 0
        has_failed = 0
        fail_then_pass = 0
        all_passed = 0
        fail_last = 0
        for revision in self.revisions.values():
            build_status = [b.passed for b in revision.builds]
            if len(revision.builds) == 0:
                no_builds += 1
                continue
            if False in build_status:
                has_failed += 1
                if build_status[-1] == True:
                    fail_then_pass +=1
            else:
                all_passed += 1
            if revision.published and build_status[-1] == False:
                fail_last += 1
        print()
        print(f'Out of the {len(self.revisions)} Revisions:')
        print(f'   {no_builds} had no builds.')
        print(f'   {has_failed} had failed builds.')
        print(f'   {fail_then_pass} had failed builds, but the last build passed.')
        print(f'   {all_passed} had only successful builds.')
        print(f'   {fail_last} were published with a failing build.')
            

if __name__ == '__main__':
    # The single positional argument is handed to PhabBuildPuller unchanged.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('repo_path')
    cli_args = arg_parser.parse_args()
    PhabBuildPuller(cli_args.repo_path).run()