From e0b86cc81b39e73bb227643125b909f52f12abdc Mon Sep 17 00:00:00 2001 From: epriestley Date: Fri, 16 Sep 2011 03:56:23 -0700 Subject: [PATCH] Add a Mercurial commit discovery daemon Summary: Repository import has three major steps: - Commit discovery (serial) - Message parsing (parallel, mostly VCS independent) - Change parsing (parallel, highly VCS dependent) This implements commit discovery for Mercurial, similar to git's parsing: - List the heads of all the branches. - If we haven't already discovered them, follow them back to their roots (or the first commit we have discovered). - Import all the newly discovered commits, oldest first. This is a little complicated but it ensures we discover commits in depth order, so the discovery process is robust against interruption/failure. If we just inserted commits as we went, we might read the tip, insert it, and then crash. When we ran again, we'd think we had already discovered commits older than HEAD. This also allows later stages to rely on being able to find Phabricator commit IDs which correspond to parent commits. NOTE: This importer is fairly slow because "hg" has a large startup time (compare "hg --version" to "git --version" and "svn --version"; on my machine, hg has 60ms of overhead for any command) and we need to run many commands (see the whole "hg id" mess). You can expect something like 10,000 per hour, which means you may need to run overnight to discover a large repository (IIRC, the svn/git discovery processes are both about an order of magnitude faster). We could improve this with batching, but I want to keep it as simple as possible for now. Test Plan: Discovered all the commits in the main Mercurial repository, http://selenic.com/repo/hg. 
Reviewers: Makinde, jungejason, nh, tuomaspelkonen, aran Reviewed By: Makinde CC: aran, Makinde Differential Revision: 943 --- .../daemon/phabricator_daemon_launcher.php | 19 ++- src/__phutil_library_map__.php | 2 + ...positoryMercurialCommitDiscoveryDaemon.php | 138 ++++++++++++++++++ .../commitdiscovery/mercurial/__init__.php | 15 ++ 4 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 src/applications/repository/daemon/commitdiscovery/mercurial/PhabricatorRepositoryMercurialCommitDiscoveryDaemon.php create mode 100644 src/applications/repository/daemon/commitdiscovery/mercurial/__init__.php diff --git a/scripts/daemon/phabricator_daemon_launcher.php b/scripts/daemon/phabricator_daemon_launcher.php index 4267a3a3c7..5fe297a6cd 100755 --- a/scripts/daemon/phabricator_daemon_launcher.php +++ b/scripts/daemon/phabricator_daemon_launcher.php @@ -85,7 +85,7 @@ switch (isset($argv[1]) ? $argv[1] : 'help') { $phid = $repository->getPHID(); switch ($repository->getVersionControlSystem()) { - case 'git': + case PhabricatorRepositoryType::REPOSITORY_TYPE_GIT: echo "Launching 'git fetch' daemon on the {$desc} repository...\n"; $control->launchDaemon( 'PhabricatorRepositoryGitFetchDaemon', @@ -99,7 +99,7 @@ switch (isset($argv[1]) ? $argv[1] : 'help') { $phid, )); break; - case 'svn': + case PhabricatorRepositoryType::REPOSITORY_TYPE_SVN: echo "Launching discovery daemon on the {$desc} repository...\n"; $control->launchDaemon( 'PhabricatorRepositorySvnCommitDiscoveryDaemon', @@ -107,6 +107,21 @@ switch (isset($argv[1]) ? 
$argv[1] : 'help') { $phid, )); break; + case PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL: + echo "Launching 'hg pull' daemon on the {$desc} repository...\n"; + $control->launchDaemon( + 'PhabricatorRepositoryMercurialPullDaemon', + array( + $phid, + )); + echo "Launching discovery daemon on the {$desc} repository...\n"; + $control->launchDaemon( + 'PhabricatorRepositoryMercurialCommitDiscoveryDaemon', + array( + $phid, + )); + break; + } } diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php index fc28cb30f0..1436e99dae 100644 --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -578,6 +578,7 @@ phutil_register_library_map(array( 'PhabricatorRepositoryGitHubNotification' => 'applications/repository/storage/githubnotification', 'PhabricatorRepositoryGitHubPostReceiveController' => 'applications/repository/controller/github-post-receive', 'PhabricatorRepositoryListController' => 'applications/repository/controller/list', + 'PhabricatorRepositoryMercurialCommitDiscoveryDaemon' => 'applications/repository/daemon/commitdiscovery/mercurial', 'PhabricatorRepositoryMercurialPullDaemon' => 'applications/repository/daemon/mercurialpull', 'PhabricatorRepositoryPullLocalDaemon' => 'applications/repository/daemon/pulllocal', 'PhabricatorRepositoryShortcut' => 'applications/repository/storage/shortcut', @@ -1178,6 +1179,7 @@ phutil_register_library_map(array( 'PhabricatorRepositoryGitHubNotification' => 'PhabricatorRepositoryDAO', 'PhabricatorRepositoryGitHubPostReceiveController' => 'PhabricatorRepositoryController', 'PhabricatorRepositoryListController' => 'PhabricatorRepositoryController', + 'PhabricatorRepositoryMercurialCommitDiscoveryDaemon' => 'PhabricatorRepositoryCommitDiscoveryDaemon', 'PhabricatorRepositoryMercurialPullDaemon' => 'PhabricatorRepositoryPullLocalDaemon', 'PhabricatorRepositoryPullLocalDaemon' => 'PhabricatorRepositoryDaemon', 'PhabricatorRepositoryShortcut' => 'PhabricatorRepositoryDAO', diff 
--git a/src/applications/repository/daemon/commitdiscovery/mercurial/PhabricatorRepositoryMercurialCommitDiscoveryDaemon.php b/src/applications/repository/daemon/commitdiscovery/mercurial/PhabricatorRepositoryMercurialCommitDiscoveryDaemon.php new file mode 100644 index 0000000000..01de317162 --- /dev/null +++ b/src/applications/repository/daemon/commitdiscovery/mercurial/PhabricatorRepositoryMercurialCommitDiscoveryDaemon.php @@ -0,0 +1,138 @@ +getRepository(); + + $vcs = $repository->getVersionControlSystem(); + if ($vcs != PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL) { + throw new Exception("Repository is not a Mercurial repository."); + } + + $repository_phid = $repository->getPHID(); + + $repo_base = $repository->getDetail('local-path'); + list($stdout) = $repository->execxLocalCommand('branches'); + + $branches = ArcanistMercurialParser::parseMercurialBranches($stdout); + $got_something = false; + foreach ($branches as $name => $branch) { + $commit = $branch['rev']; + $commit = $this->getFullHash($commit); + if ($this->isKnownCommit($commit)) { + continue; + } else { + $this->discoverCommit($commit); + $got_something = true; + } + } + + return $got_something; + } + + private function getFullHash($commit) { + + // NOTE: Mercurial shortens hashes to 12 characters by default. This + // implies collisions with as few as a few million commits. The + // documentation sensibly advises "Do not use short-form IDs for + // long-lived representations". It then continues "You can use the + // --debug option to display the full changeset ID". What?! Yes, this + // is in fact the only way to turn on full hashes, and the hg source + // code is littered with "hexfn = ui.debugflag and hex or short" and + // similar. There is no more-selective flag or config option. + // + // Unfortunately, "hg --debug" turns on tons of other extra output, + // including full commit messages in "hg log" and "hg parents" (which + // ignore --style); this renders them unparseable. 
So we have to use + // "hg id" to convert short hashes into full hashes. See: + // + // + // + // Of course, this means that if there are collisions we will break here + // (the short commit identifier won't be unambiguous) but maybe Mercurial + // will have a --full-hashes flag or something by then and we can fix it + // properly. Until we run into that, this allows us to store data in the + // right format so when we eventually encounter this we won't have to + // reparse every Mercurial repository. + + $repository = $this->getRepository(); + list($stdout) = $repository->execxLocalCommand( + 'id --debug -i --rev %s', + $commit); + return trim($stdout); + } + + private function discoverCommit($commit) { + $discover = array(); + $insert = array(); + + $repository = $this->getRepository(); + + $discover[] = $commit; + $insert[] = $commit; + + $seen_parent = array(); + + // For all the new commits at the branch heads, walk backward until we find + // only commits we've already seen. + while (true) { + $target = array_pop($discover); + list($stdout) = $repository->execxLocalCommand( + 'parents --style default --rev %s', + $target); + $parents = ArcanistMercurialParser::parseMercurialLog($stdout); + if ($parents) { + foreach ($parents as $parent) { + $parent_commit = $parent['rev']; + $parent_commit = $this->getFullHash($parent_commit); + if (isset($seen_parent[$parent_commit])) { + continue; + } + $seen_parent[$parent_commit] = true; + if (!$this->isKnownCommit($parent_commit)) { + $discover[] = $parent_commit; + $insert[] = $parent_commit; + } + } + } + if (empty($discover)) { + break; + } + $this->stillWorking(); + } + + while (true) { + $target = array_pop($insert); + list($stdout) = $repository->execxLocalCommand( + 'log --rev %s --template %s', + $target, + '{date|rfc822date}'); + $epoch = strtotime($stdout); + + $this->recordCommit($target, $epoch); + + if (empty($insert)) { + break; + } + } + } + +} diff --git
a/src/applications/repository/daemon/commitdiscovery/mercurial/__init__.php b/src/applications/repository/daemon/commitdiscovery/mercurial/__init__.php new file mode 100644 index 0000000000..e18b3c3941 --- /dev/null +++ b/src/applications/repository/daemon/commitdiscovery/mercurial/__init__.php @@ -0,0 +1,15 @@ +