From bff72ce3b582c857840ef83c7a5647d24449d4f0 Mon Sep 17 00:00:00 2001 From: epriestley Date: Wed, 18 Sep 2019 15:19:40 -0700 Subject: [PATCH] Generate more friendly anchor names for header sections in Remarkup Summary: Depends on D20820. Ref T13410. We currently cut anchor names in the middle, don't support emoji in anchors, and generate relatively short anchors. Generate slightly longer anchors, allow more unicode, and try not to cut things in the middle. Test Plan: Created a document with a variety of different anchors and saw them generate more usable names. Maniphest Tasks: T13410 Differential Revision: https://secure.phabricator.com/D20821 --- src/__phutil_library_map__.php | 2 + .../__tests__/PhabricatorAnchorTestCase.php | 38 ++++++++++ .../PhutilRemarkupHeaderBlockRule.php | 70 +++++++++++++------ .../markuprule/PhutilRemarkupBoldRule.php | 4 ++ .../markup/remarkup/PhutilRemarkupEngine.php | 4 ++ .../remarkup/__tests__/remarkup/toc.txt | 6 +- 6 files changed, 100 insertions(+), 24 deletions(-) create mode 100644 src/infrastructure/markup/__tests__/PhabricatorAnchorTestCase.php diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php index f062b577c3..e5bd0a40e3 100644 --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -2150,6 +2150,7 @@ phutil_register_library_map(array( 'PhabricatorAlmanacApplication' => 'applications/almanac/application/PhabricatorAlmanacApplication.php', 'PhabricatorAmazonAuthProvider' => 'applications/auth/provider/PhabricatorAmazonAuthProvider.php', 'PhabricatorAmazonSNSFuture' => 'applications/metamta/future/PhabricatorAmazonSNSFuture.php', + 'PhabricatorAnchorTestCase' => 'infrastructure/markup/__tests__/PhabricatorAnchorTestCase.php', 'PhabricatorAnchorView' => 'view/layout/PhabricatorAnchorView.php', 'PhabricatorAphlictManagementDebugWorkflow' => 'applications/aphlict/management/PhabricatorAphlictManagementDebugWorkflow.php', 'PhabricatorAphlictManagementNotifyWorkflow' => 'applications/aphlict/management/PhabricatorAphlictManagementNotifyWorkflow.php', @@ -8314,6 +8315,7 @@ phutil_register_library_map(array( 'PhabricatorAlmanacApplication' => 'PhabricatorApplication', 'PhabricatorAmazonAuthProvider' => 'PhabricatorOAuth2AuthProvider', 'PhabricatorAmazonSNSFuture' => 'PhutilAWSFuture', + 'PhabricatorAnchorTestCase' => 'PhabricatorTestCase', 'PhabricatorAnchorView' => 'AphrontView', 'PhabricatorAphlictManagementDebugWorkflow' => 'PhabricatorAphlictManagementWorkflow', 'PhabricatorAphlictManagementNotifyWorkflow' => 'PhabricatorAphlictManagementWorkflow', diff --git a/src/infrastructure/markup/__tests__/PhabricatorAnchorTestCase.php b/src/infrastructure/markup/__tests__/PhabricatorAnchorTestCase.php new file mode 100644 index 0000000000..2b06fd693d --- /dev/null +++ b/src/infrastructure/markup/__tests__/PhabricatorAnchorTestCase.php @@ -0,0 +1,38 @@ + '', + 'Bells and Whistles' => 'bells-and-whistles', + 'Termination for Nonpayment' => 'termination-for-nonpayment', + $low_ascii => '0123456789-abcdefghijklmnopqrstu', + 'xxxx xxxx xxxx xxxx xxxx on' => 'xxxx-xxxx-xxxx-xxxx-xxxx', + 'xxxx xxxx xxxx xxxx xxxx ox' => 'xxxx-xxxx-xxxx-xxxx-xxxx-ox', + "So, You Want To Build A {$snowman}?" => + "so-you-want-to-build-a-{$snowman}", + str_repeat($snowman, 128) => str_repeat($snowman, 32), + ); + + foreach ($map as $input => $expect) { + $anchor = PhutilRemarkupHeaderBlockRule::getAnchorNameFromHeaderText( + $input); + + $this->assertEqual( + $expect, + $anchor, + pht('Anchor for "%s".', $input)); + } + } + +} diff --git a/src/infrastructure/markup/blockrule/PhutilRemarkupHeaderBlockRule.php b/src/infrastructure/markup/blockrule/PhutilRemarkupHeaderBlockRule.php index cbcb143339..2470c18118 100644 --- a/src/infrastructure/markup/blockrule/PhutilRemarkupHeaderBlockRule.php +++ b/src/infrastructure/markup/blockrule/PhutilRemarkupHeaderBlockRule.php @@ -73,24 +73,7 @@ final class PhutilRemarkupHeaderBlockRule extends PhutilRemarkupBlockRule { } private function generateAnchor($level, $text) { - $anchor = strtolower($text); - $anchor = preg_replace('/[^a-z0-9]/', '-', $anchor); - $anchor = preg_replace('/--+/', '-', $anchor); - $anchor = trim($anchor, '-'); - $anchor = substr($anchor, 0, 24); - $anchor = trim($anchor, '-'); - $base = $anchor; - - $key = self::KEY_HEADER_TOC; $engine = $this->getEngine(); - $anchors = $engine->getTextMetadata($key, array()); - - $suffix = 1; - while (!strlen($anchor) || isset($anchors[$anchor])) { - $anchor = $base.'-'.$suffix; - $anchor = trim($anchor, '-'); - $suffix++; - } // When a document contains a link inside a header, like this: // @@ -100,12 +83,30 @@ final class PhutilRemarkupHeaderBlockRule extends PhutilRemarkupBlockRule { // header itself. We push the 'toc' state so all the link rules generate // just names. $engine->pushState('toc'); - $text = $this->applyRules($text); - $text = $engine->restoreText($text); - - $anchors[$anchor] = array($level, $text); + $plain_text = $text; + $plain_text = $this->applyRules($plain_text); + $plain_text = $engine->restoreText($plain_text); $engine->popState('toc'); + $anchor = self::getAnchorNameFromHeaderText($plain_text); + + if (!strlen($anchor)) { + return null; + } + + $base = $anchor; + + $key = self::KEY_HEADER_TOC; + $anchors = $engine->getTextMetadata($key, array()); + + $suffix = 1; + while (isset($anchors[$anchor])) { + $anchor = $base.'-'.$suffix; + $anchor = trim($anchor, '-'); + $suffix++; + } + + $anchors[$anchor] = array($level, $plain_text); $engine->setTextMetadata($key, $anchors); return phutil_tag( @@ -159,4 +160,31 @@ final class PhutilRemarkupHeaderBlockRule extends PhutilRemarkupBlockRule { return phutil_implode_html("\n", $toc); } + public static function getAnchorNameFromHeaderText($text) { + $anchor = phutil_utf8_strtolower($text); + + // Replace all latin characters which are not "a-z" or "0-9" with "-". + // Preserve other characters, since non-latin letters and emoji work + // fine in anchors. + $anchor = preg_replace('/[\x00-\x2F\x3A-\x60\x7B-\x7F]+/', '-', $anchor); + $anchor = trim($anchor, '-'); + + // Truncate the fragment to something reasonable. + $anchor = id(new PhutilUTF8StringTruncator()) + ->setMaximumGlyphs(32) + ->setTerminator('') + ->truncateString($anchor); + + // If the fragment is terminated by a word which "The U.S. Government + // Printing Office Style Manual" normally discourages capitalizing in + // titles, discard it. This is an arbitrary heuristic intended to avoid + // awkward hanging words in anchors. + $anchor = preg_replace( + '/-(a|an|the|at|by|for|in|of|on|per|to|up|and|as|but|if|or|nor)\z/', + '', + $anchor); + + return $anchor; + } + } diff --git a/src/infrastructure/markup/markuprule/PhutilRemarkupBoldRule.php b/src/infrastructure/markup/markuprule/PhutilRemarkupBoldRule.php index 4640df2b2e..6d056db0a0 100644 --- a/src/infrastructure/markup/markuprule/PhutilRemarkupBoldRule.php +++ b/src/infrastructure/markup/markuprule/PhutilRemarkupBoldRule.php @@ -18,6 +18,10 @@ final class PhutilRemarkupBoldRule extends PhutilRemarkupRule { } protected function applyCallback(array $matches) { + if ($this->getEngine()->isAnchorMode()) { + return $matches[1]; + } + return hsprintf('%s', $matches[1]); } diff --git a/src/infrastructure/markup/remarkup/PhutilRemarkupEngine.php b/src/infrastructure/markup/remarkup/PhutilRemarkupEngine.php index 1c5ff78607..0b01fd6a69 100644 --- a/src/infrastructure/markup/remarkup/PhutilRemarkupEngine.php +++ b/src/infrastructure/markup/remarkup/PhutilRemarkupEngine.php @@ -34,6 +34,10 @@ final class PhutilRemarkupEngine extends PhutilMarkupEngine { return $this->mode & self::MODE_TEXT; } + public function isAnchorMode() { + return $this->getState('toc'); + } + public function isHTMLMailMode() { return $this->mode & self::MODE_HTML_MAIL; } diff --git a/src/infrastructure/markup/remarkup/__tests__/remarkup/toc.txt b/src/infrastructure/markup/remarkup/__tests__/remarkup/toc.txt index 43448f75b5..9505271576 100644 --- a/src/infrastructure/markup/remarkup/__tests__/remarkup/toc.txt +++ b/src/infrastructure/markup/remarkup/__tests__/remarkup/toc.txt @@ -6,14 +6,14 @@ ~~~~~~~~~~ -

link_name

+

link_name

bold