mirror of
https://we.phorge.it/source/arcanist.git
synced 2025-02-09 05:18:29 +01:00
Summary: Ref T13395. Moves all remaining code in "libphutil/" into "arcanist/". Test Plan: Ran various arc workflows, although this probably has some remaining rough edges. Maniphest Tasks: T13395 Differential Revision: https://secure.phabricator.com/D20980
295 lines
9.8 KiB
PHP
295 lines
9.8 KiB
PHP
<?php
|
|
|
|
/**
|
|
* Truncate a UTF-8 string to a some maximum number of bytes, codepoints, or
|
|
* glyphs.
|
|
*
|
|
* This class takes some UTF-8 string as input, and emits a valid UTF-8 string
|
|
* which is no longer than given byte, codepoint, or glyph limits.
|
|
*
|
|
* $short = id(new PhutilUTF8StringTruncator())
|
|
* ->setMaximumGlyphs(80)
|
|
* ->truncateString($long);
|
|
*
|
|
* Byte limits restrict the number of bytes the result may contain. They are
|
|
* appropriate when you care about how much storage a string requires.
|
|
*
|
|
* Codepoint limits restrict the number of codepoints the result may contain.
|
|
* Since codepoints may have up to 4 bytes, the resulting strings may require
|
|
* have more than this many bytes. This kind of limit is appropriate when you
|
|
* are using UTF-8 storage.
|
|
*
|
|
* Glyph limits restrict the display size of the string. Because a single glyph
|
|
* may have an arbitrary number of combining characters, this does not impose
|
|
* a storage size limit on the string: a string with only one glyph may require
|
|
* an arbitrarily large number of bytes.
|
|
*
|
|
* You can set more than one limit; the smallest limit will be used.
|
|
*
|
|
* NOTE: This function makes a best effort to apply some reasonable rules but
|
|
* will not work well for the full range of unicode languages.
|
|
*/
|
|
final class PhutilUTF8StringTruncator extends Phobject {
|
|
|
|
private $maximumBytes;
|
|
private $maximumCodepoints;
|
|
private $maximumGlyphs;
|
|
private $minimumLimit;
|
|
|
|
private $terminator = "\xE2\x80\xA6";
|
|
private $terminatorBytes = 3;
|
|
private $terminatorCodepoints = 1;
|
|
private $terminatorGlyphs = 1;
|
|
|
|
public function setMaximumBytes($maximum_bytes) {
|
|
$this->maximumBytes = $maximum_bytes;
|
|
$this->didUpdateMaxima();
|
|
return $this;
|
|
}
|
|
|
|
public function setMaximumCodepoints($maximum_codepoints) {
|
|
$this->maximumCodepoints = $maximum_codepoints;
|
|
$this->didUpdateMaxima();
|
|
return $this;
|
|
}
|
|
|
|
public function setMaximumGlyphs($maximum_glyphs) {
|
|
$this->maximumGlyphs = $maximum_glyphs;
|
|
$this->didUpdateMaxima();
|
|
return $this;
|
|
}
|
|
|
|
private function didUpdateMaxima() {
|
|
$this->minimumLimit = INF;
|
|
|
|
if ($this->maximumBytes) {
|
|
$this->minimumLimit = min($this->minimumLimit, $this->maximumBytes);
|
|
}
|
|
|
|
if ($this->maximumCodepoints) {
|
|
$this->minimumLimit = min($this->minimumLimit, $this->maximumCodepoints);
|
|
}
|
|
|
|
if ($this->maximumGlyphs) {
|
|
$this->minimumLimit = min($this->minimumLimit, $this->maximumGlyphs);
|
|
}
|
|
}
|
|
|
|
public function setTerminator($terminator) {
|
|
$this->terminator = $terminator;
|
|
$this->terminatorBytes = strlen($terminator);
|
|
$this->terminatorCodepoints = count(phutil_utf8v($terminator));
|
|
$this->terminatorGlyphs = count(phutil_utf8v_combined($terminator));
|
|
return $this;
|
|
}
|
|
|
|
public function truncateString($string) {
|
|
// First, check if the string has fewer bytes than the most restrictive
|
|
// limit. Codepoints and glyphs always take up at least one byte, so we can
|
|
// just return the string unmodified if we're under all of the limits.
|
|
$byte_len = strlen($string);
|
|
if ($byte_len <= $this->minimumLimit) {
|
|
return $string;
|
|
}
|
|
|
|
// We're going to vectorize the string so we can deal with it in terms
|
|
// of unicode characters. If the string is huge (like 10MB) and we are
|
|
// only extracting a tiny piece of it (like the first 1024 bytes), we
|
|
// want to avoid vectorizing the entire string in cases where there is
|
|
// no possibility that we'll need all of it. Try to compute a "hard limit":
|
|
// an upper bound on the number of bytes we can ever need to process.
|
|
|
|
$hard_limits = array();
|
|
if ($this->maximumBytes) {
|
|
$hard_limits[] = $this->maximumBytes;
|
|
}
|
|
|
|
if ($this->maximumCodepoints) {
|
|
// No UTF8 character is longer than 6 bytes, so we can impose a ceiling
|
|
// if we have a codepoint limit.
|
|
$hard_limits[] = ($this->maximumCodepoints * 6);
|
|
}
|
|
|
|
if ($hard_limits) {
|
|
// Add a few more bytes onto the end so that we have a little more of
|
|
// the string than we actually need and can get the right terminator
|
|
// behavior.
|
|
$hard_limit = max($hard_limits) + 32;
|
|
} else {
|
|
$hard_limit = null;
|
|
}
|
|
|
|
// Build a vector of characters first.
|
|
$string_pv = phutil_utf8v($string, $hard_limit);
|
|
$point_len = count($string_pv);
|
|
|
|
// We always need the combined vector, even if we're only doing byte or
|
|
// codepoint truncation, because we don't want to truncate to half of a
|
|
// combining character.
|
|
$string_gv = phutil_utf8v_combine_characters($string_pv);
|
|
$glyph_len = count($string_gv);
|
|
|
|
// Now, check if we're still over the limits. For example, a string may
|
|
// be over the raw byte limit but under the glyph limit if it contains
|
|
// several multibyte characters.
|
|
|
|
$too_long = false;
|
|
if ($this->maximumBytes && ($byte_len > $this->maximumBytes)) {
|
|
$too_long = true;
|
|
}
|
|
if ($this->maximumCodepoints && ($point_len > $this->maximumCodepoints)) {
|
|
$too_long = true;
|
|
}
|
|
if ($this->maximumGlyphs && ($glyph_len > $this->maximumGlyphs)) {
|
|
$too_long = true;
|
|
}
|
|
|
|
if (!$too_long) {
|
|
return $string;
|
|
}
|
|
|
|
// This string is legitimately longer than at least one of the limits, so
|
|
// we need to truncate it. Find the minimum cutoff point: this is the last
|
|
// glyph we can possibly return while satisfying the limits and having space
|
|
// for the terminator.
|
|
|
|
$cutoff = $glyph_len;
|
|
if ($this->maximumBytes) {
|
|
if ($byte_len <= $this->maximumBytes) {
|
|
$cutoff = $glyph_len;
|
|
} else {
|
|
$bytes = $this->terminatorBytes;
|
|
for ($ii = 0; $ii < $glyph_len; $ii++) {
|
|
$bytes += strlen($string_gv[$ii]);
|
|
if ($bytes > $this->maximumBytes) {
|
|
$cutoff = $ii;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if ($this->maximumCodepoints) {
|
|
if ($point_len <= $this->maximumCodepoints) {
|
|
$cutoff = min($cutoff, $glyph_len);
|
|
} else {
|
|
$points = 0;
|
|
for ($ii = 0; $ii < $glyph_len; $ii++) {
|
|
$glyph_bytes = strlen($string_gv[$ii]);
|
|
while ($points < $point_len) {
|
|
$glyph_bytes -= strlen($string_pv[$points]);
|
|
$points++;
|
|
if ($glyph_bytes <= 0) {
|
|
break;
|
|
}
|
|
}
|
|
$points_total = $points + $this->terminatorCodepoints;
|
|
if ($points_total > $this->maximumCodepoints) {
|
|
$cutoff = min($cutoff, $ii);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if ($this->maximumGlyphs) {
|
|
if ($glyph_len <= $this->maximumGlyphs) {
|
|
$cutoff = min($cutoff, $glyph_len);
|
|
} else {
|
|
$cutoff = min($cutoff, $this->maximumGlyphs - $this->terminatorGlyphs);
|
|
}
|
|
}
|
|
|
|
// If we don't have enough characters for anything, just return the
|
|
// terminator.
|
|
if ($cutoff <= 0) {
|
|
return $this->terminator;
|
|
}
|
|
|
|
// Otherwise, we're going to try to cut the string off somewhere reasonable
|
|
// rather than somewhere arbitrary.
|
|
|
|
// NOTE: This is not complete, and there are many other word boundary
|
|
// characters and reasonable places to break words in the UTF-8 character
|
|
// space. For now, this gives us reasonable behavior for latin languages. We
|
|
// don't necessarily have access to PCRE+Unicode so there isn't a great way
|
|
// for us to look up character attributes.
|
|
|
|
// If we encounter these, prefer to break on them instead of cutting the
|
|
// string off in the middle of a word.
|
|
static $break_characters = array(
|
|
' ' => true,
|
|
"\n" => true,
|
|
';' => true,
|
|
':' => true,
|
|
'[' => true,
|
|
'(' => true,
|
|
',' => true,
|
|
'-' => true,
|
|
);
|
|
|
|
// If we encounter these, shorten to this character exactly without
|
|
// appending the terminal.
|
|
static $stop_characters = array(
|
|
'.' => true,
|
|
'!' => true,
|
|
'?' => true,
|
|
);
|
|
|
|
// Search backward in the string, looking for reasonable places to break it.
|
|
$word_boundary = null;
|
|
$stop_boundary = null;
|
|
$any_nonboundary = false;
|
|
|
|
// If we do a word break with a terminal, we have to look beyond at least
|
|
// the number of characters in the terminal. If the terminal is longer than
|
|
// the required length, we'll skip this whole block and return it on its
|
|
// own.
|
|
|
|
// Only search backward for a while. At some point we don't get a better
|
|
// result by looking through the whole string, and if this is "MMM..." or
|
|
// a non-latin language without word break characters we're just wasting
|
|
// time.
|
|
|
|
// See PHI654. We also only look for a break near the end of the text,
|
|
// relative to the length of the text. If the text is something like
|
|
// "O123: MMMMMM..." or "See path/to/long/thing", we want to cut the very
|
|
// long word in half, not just render "O123..." or "See...".
|
|
|
|
$search = max(0, $cutoff - 256, $cutoff / 2);
|
|
for ($ii = min($cutoff, $glyph_len - 1); $ii >= $search; $ii--) {
|
|
$c = $string_gv[$ii];
|
|
|
|
if (isset($break_characters[$c])) {
|
|
$word_boundary = $ii;
|
|
} else if (isset($stop_characters[$c])) {
|
|
$stop_boundary = $ii + 1;
|
|
break;
|
|
} else {
|
|
$any_nonboundary = true;
|
|
if ($word_boundary !== null) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if ($stop_boundary !== null) {
|
|
// We found a character like ".". Cut the string there, without appending
|
|
// the terminal.
|
|
$string_part = array_slice($string_gv, 0, $stop_boundary);
|
|
return implode('', $string_part);
|
|
}
|
|
|
|
// If we didn't find any boundary characters or we found ONLY boundary
|
|
// characters, just break at the maximum character length.
|
|
if ($word_boundary === null || !$any_nonboundary) {
|
|
$word_boundary = $cutoff;
|
|
}
|
|
|
|
$string_part = array_slice($string_gv, 0, $word_boundary);
|
|
$string_part = implode('', $string_part);
|
|
|
|
return $string_part.$this->terminator;
|
|
}
|
|
|
|
}
|