From 6b99aac49ddf4aff524162b6757cf7491f146c83 Mon Sep 17 00:00:00 2001 From: epriestley Date: Mon, 22 Jan 2018 07:23:04 -0800 Subject: [PATCH] Digest changeset anchors into purely alphanumeric strings Summary: Ref T13045. See that task for discussion. This replaces `digestForIndex()` with a "clever" algorithm in `digestForAnchor()`. The new digest is the same as `digestForIndex()` except when the original output was "." or "_". In those cases, a replacement character is selected based on entropy accumulated by the digest function as it iterates through the string. Test Plan: Added unit tests. Reviewers: amckinley Reviewed By: amckinley Maniphest Tasks: T13045 Differential Revision: https://secure.phabricator.com/D18909 --- .../storage/DifferentialChangeset.php | 2 +- src/infrastructure/util/PhabricatorHash.php | 61 ++++++++++++++++++- .../__tests__/PhabricatorHashTestCase.php | 43 +++++++++++++ 3 files changed, 103 insertions(+), 3 deletions(-) diff --git a/src/applications/differential/storage/DifferentialChangeset.php b/src/applications/differential/storage/DifferentialChangeset.php index 3656fe6507..ebdaeacd0a 100644 --- a/src/applications/differential/storage/DifferentialChangeset.php +++ b/src/applications/differential/storage/DifferentialChangeset.php @@ -173,7 +173,7 @@ final class DifferentialChangeset } public function getAnchorName() { - return 'change-'.PhabricatorHash::digestForIndex($this->getFilename()); + return 'change-'.PhabricatorHash::digestForAnchor($this->getFilename()); } public function getAbsoluteRepositoryPath( diff --git a/src/infrastructure/util/PhabricatorHash.php b/src/infrastructure/util/PhabricatorHash.php index 4cd1abcf15..ce48b0966b 100644 --- a/src/infrastructure/util/PhabricatorHash.php +++ b/src/infrastructure/util/PhabricatorHash.php @@ -3,6 +3,7 @@ final class PhabricatorHash extends Phobject { const INDEX_DIGEST_LENGTH = 12; + const ANCHOR_DIGEST_LENGTH = 12; /** * Digest a string using HMAC+SHA1. @@ -38,8 +39,8 @@ final class PhabricatorHash extends Phobject { * related hashing (for general purpose hashing, see @{method:digest}). * * @param string Input string. - * @return string 12-byte, case-sensitive alphanumeric hash of the string - * which + * @return string 12-byte, case-sensitive, mostly-alphanumeric hash of + * the string. */ public static function digestForIndex($string) { $hash = sha1($string, $raw_output = true); @@ -63,6 +64,62 @@ final class PhabricatorHash extends Phobject { return $result; } + /** + * Digest a string for use in HTML page anchors. This is similar to + * @{method:digestForIndex} but produces purely alphanumeric output. + * + * This tries to be mostly compatible with the index digest to limit how + * much stuff we're breaking by switching to it. For additional discussion, + * see T13045. + * + * @param string Input string. + * @return string 12-byte, case-sensitive, purely-alphanumeric hash of + * the string. + */ + public static function digestForAnchor($string) { + $hash = sha1($string, $raw_output = true); + + static $map; + if ($map === null) { + $map = '0123456789'. + 'abcdefghij'. + 'klmnopqrst'. + 'uvwxyzABCD'. + 'EFGHIJKLMN'. + 'OPQRSTUVWX'. + 'YZ'; + } + + $result = ''; + $accum = 0; + $map_size = strlen($map); + for ($ii = 0; $ii < self::ANCHOR_DIGEST_LENGTH; $ii++) { + $byte = ord($hash[$ii]); + $low_bits = ($byte & 0x3F); + $accum = ($accum + $byte) % $map_size; + + if ($low_bits < $map_size) { + // If an index digest would produce any alphanumeric character, just + // use that character. This means that these digests are the same as + // digests created with "digestForIndex()" in all positions where the + // output character is some character other than "." or "_". + $result .= $map[$low_bits]; + } else { + // If an index digest would produce a non-alphumeric character ("." or + // "_"), pick an alphanumeric character instead. We accumulate an + // index into the alphanumeric character list to try to preserve + // entropy here. We could use this strategy for all bytes instead, + // but then these digests would differ from digests created with + // "digestForIndex()" in all positions, instead of just a small number + // of positions. + $result .= $map[$accum]; + } + } + + return $result; + } + + public static function digestToRange($string, $min, $max) { if ($min > $max) { throw new Exception(pht('Maximum must be larger than minimum.')); diff --git a/src/infrastructure/util/__tests__/PhabricatorHashTestCase.php b/src/infrastructure/util/__tests__/PhabricatorHashTestCase.php index b89515701d..008270e767 100644 --- a/src/infrastructure/util/__tests__/PhabricatorHashTestCase.php +++ b/src/infrastructure/util/__tests__/PhabricatorHashTestCase.php @@ -52,4 +52,47 @@ final class PhabricatorHashTestCase extends PhabricatorTestCase { pht('Distinct characters in hash of: %s', $input)); } + public function testHashForAnchor() { + $map = array( + // For inputs with no "." or "_" in the output, digesting for an index + // or an anchor should be the same. + 'dog' => array( + 'Aliif7Qjd5ct', + 'Aliif7Qjd5ct', + ), + // When an output would contain "." or "_", it should be replaced with + // an alphanumeric character in those positions instead. + 'fig' => array( + 'OpB9tY4i.MOX', + 'OpB9tY4imMOX', + ), + 'cot' => array( + '_iF26XU_PsVY', + '3iF26XUkPsVY', + ), + // The replacement characters are well-distributed and generally keep + // the entropy of the output high: here, "_" characters in the initial + // positions of the digests of "cot" (above) and "dug" (this test) have + // different outputs. + 'dug' => array( + '_XuQnp0LUlUW', + '7XuQnp0LUlUW', + ), + ); + + foreach ($map as $input => $expect) { + list($expect_index, $expect_anchor) = $expect; + + $this->assertEqual( + $expect_index, + PhabricatorHash::digestForIndex($input), + pht('Index digest of "%s".', $input)); + + $this->assertEqual( + $expect_anchor, + PhabricatorHash::digestForAnchor($input), + pht('Anchor digest of "%s".', $input)); + } + } + }