mirror of
https://we.phorge.it/source/arcanist.git
synced 2025-02-02 18:08:25 +01:00
9b74cb4ee6
Summary: Ref T13395. Moves all remaining code in "libphutil/" into "arcanist/". Test Plan: Ran various arc workflows, although this probably has some remaining rough edges. Maniphest Tasks: T13395 Differential Revision: https://secure.phabricator.com/D20980
562 lines
14 KiB
PHP
562 lines
14 KiB
PHP
<?php
|
|
|
|
/**
|
|
* Compute edit distance between two scalar sequences. This class uses
|
|
* Levenshtein (or Damerau-Levenshtein) to compute the edit distance between
|
|
* two inputs. The inputs are arrays containing any scalars (not just strings)
|
|
* so it can be used with, e.g., utf8 sequences.
|
|
*
|
|
* $matrix = id(new PhutilEditDistanceMatrix())
|
|
* ->setSequences(str_split('ran'), str_split('rat'));
|
|
*
|
|
* $cost = $matrix->getEditDistance();
|
|
*
|
|
* Edit distance computation is slow and requires a large amount of memory;
|
|
* both are roughly O(N * M) in the length of the strings.
|
|
*
|
|
* You can customize the cost of insertions, deletions and replacements. By
|
|
* default, each has a cost of 1.
|
|
*
|
|
* $matrix->setReplaceCost(1);
|
|
*
|
|
* By default, transpositions are not evaluated (i.e., the algorithm is
|
|
* Levenshtein). You can cause transpositions to be evaluated by setting a
|
|
* transpose cost (which will change the algorithm to Damerau-Levenshtein):
|
|
*
|
|
* $matrix->setTransposeCost(1);
|
|
*
|
|
* You can also provide a cost to alter the type of operation being applied.
|
|
* Many strings have several equivalently expensive edit sequences, but one
|
|
* some sequences are more readable to humans than others. Providing a small
|
|
* cost to alter operation type tends to smooth out the sequence and produce
|
|
* long runs of a single operation, which are generally more readable. For
|
|
* example, these strings:
|
|
*
|
|
* (x)
|
|
* ((x))
|
|
*
|
|
* ...have edit strings "issis" and "isssi", which describe edit operations with
|
|
* the same cost, but the latter string is more comprehensible to human viewers.
|
|
*
|
|
* If you set an alter cost, you must call @{method:setComputeString} to enable
|
|
* type computation. The alter cost should generally be smaller than `c / N`,
|
|
* where `c` is the smallest operational cost and `N` is the length of the
|
|
* longest string. For example, if you are using the default costs (insert = 1,
|
|
* delete = 1, replace = 1) and computing edit distances for strings of fewer
|
|
* than 1,000 characters, you might set the alter cost to 0.001.
|
|
*/
|
|
final class PhutilEditDistanceMatrix extends Phobject {
|
|
|
|
private $insertCost = 1;
|
|
private $deleteCost = 1;
|
|
private $replaceCost = 1;
|
|
private $transposeCost = null;
|
|
private $alterCost = 0;
|
|
private $maximumLength;
|
|
private $computeString;
|
|
private $applySmoothing = self::SMOOTHING_NONE;
|
|
private $reachedMaximumLength;
|
|
|
|
private $x;
|
|
private $y;
|
|
private $prefix;
|
|
private $suffix;
|
|
|
|
private $distanceMatrix = null;
|
|
private $typeMatrix = null;
|
|
|
|
const SMOOTHING_NONE = 'none';
|
|
const SMOOTHING_INTERNAL = 'internal';
|
|
const SMOOTHING_FULL = 'full';
|
|
|
|
public function setMaximumLength($maximum_length) {
|
|
$this->maximumLength = $maximum_length;
|
|
return $this;
|
|
}
|
|
|
|
public function getMaximumLength() {
|
|
return coalesce($this->maximumLength, $this->getInfinity());
|
|
}
|
|
|
|
public function didReachMaximumLength() {
|
|
return $this->reachedMaximumLength;
|
|
}
|
|
|
|
public function setComputeString($compute_string) {
|
|
$this->computeString = $compute_string;
|
|
return $this;
|
|
}
|
|
|
|
public function getComputeString() {
|
|
return $this->computeString;
|
|
}
|
|
|
|
public function setTransposeCost($transpose_cost) {
|
|
$this->transposeCost = $transpose_cost;
|
|
return $this;
|
|
}
|
|
|
|
public function getTransposeCost() {
|
|
return $this->transposeCost;
|
|
}
|
|
|
|
public function setReplaceCost($replace_cost) {
|
|
$this->replaceCost = $replace_cost;
|
|
return $this;
|
|
}
|
|
|
|
public function getReplaceCost() {
|
|
return $this->replaceCost;
|
|
}
|
|
|
|
public function setDeleteCost($delete_cost) {
|
|
$this->deleteCost = $delete_cost;
|
|
return $this;
|
|
}
|
|
|
|
public function getDeleteCost() {
|
|
return $this->deleteCost;
|
|
}
|
|
|
|
public function setInsertCost($insert_cost) {
|
|
$this->insertCost = $insert_cost;
|
|
return $this;
|
|
}
|
|
|
|
public function getInsertCost() {
|
|
return $this->insertCost;
|
|
}
|
|
|
|
public function setAlterCost($alter_cost) {
|
|
$this->alterCost = $alter_cost;
|
|
return $this;
|
|
}
|
|
|
|
public function getAlterCost() {
|
|
return $this->alterCost;
|
|
}
|
|
|
|
public function setApplySmoothing($apply_smoothing) {
|
|
$this->applySmoothing = $apply_smoothing;
|
|
return $this;
|
|
}
|
|
|
|
public function getApplySmoothing() {
|
|
return $this->applySmoothing;
|
|
}
|
|
|
|
public function setSequences(array $x, array $y) {
|
|
|
|
// NOTE: We strip common prefixes and suffixes from the inputs because
|
|
// the runtime of the edit distance algorithm is large and it is common
|
|
// to diff similar strings.
|
|
|
|
$xl = count($x);
|
|
$yl = count($y);
|
|
$l = min($xl, $yl);
|
|
|
|
$prefix_l = 0;
|
|
$suffix_l = 0;
|
|
|
|
for ($ii = 0; $ii < $l; $ii++) {
|
|
if ($x[$ii] !== $y[$ii]) {
|
|
break;
|
|
}
|
|
$prefix_l++;
|
|
}
|
|
|
|
for ($ii = 1; $ii <= ($l - $prefix_l); $ii++) {
|
|
if ($x[$xl - $ii] !== $y[$yl - $ii]) {
|
|
break;
|
|
}
|
|
$suffix_l++;
|
|
}
|
|
|
|
$this->prefix = array_slice($x, 0, $prefix_l);
|
|
$this->suffix = array_slice($x, $xl - $suffix_l);
|
|
|
|
$this->x = array_slice($x, $prefix_l, $xl - ($suffix_l + $prefix_l));
|
|
$this->y = array_slice($y, $prefix_l, $yl - ($suffix_l + $prefix_l));
|
|
$this->distanceMatrix = null;
|
|
|
|
return $this;
|
|
}
|
|
|
|
private function requireSequences() {
|
|
if ($this->x === null) {
|
|
throw new PhutilInvalidStateException('setSequences');
|
|
}
|
|
}
|
|
|
|
public function getEditDistance() {
|
|
$this->requireSequences();
|
|
|
|
$x = $this->x;
|
|
$y = $this->y;
|
|
|
|
if (!$x) {
|
|
return $this->insertCost * count($y);
|
|
}
|
|
|
|
if (!$y) {
|
|
return $this->deleteCost * count($x);
|
|
}
|
|
|
|
$max = $this->getMaximumLength();
|
|
if (count($x) > $max || count($y) > $max) {
|
|
$this->reachedMaximumLength = true;
|
|
return ($this->insertCost * count($y)) + ($this->deleteCost * count($x));
|
|
}
|
|
|
|
if ($x === $y) {
|
|
return 0;
|
|
}
|
|
|
|
$matrix = $this->getDistanceMatrix();
|
|
|
|
return $matrix[count($x)][count($y)];
|
|
}
|
|
|
|
/**
|
|
* Return a string representing the edits between the sequences. The string
|
|
* has these characters:
|
|
*
|
|
* - s (same): Same character in both strings.
|
|
* - i (insert): Character inserted.
|
|
* - d (delete): Character deleted.
|
|
* - x (replace): Character replaced.
|
|
* - t (transpose): Character transposed.
|
|
*/
|
|
public function getEditString() {
|
|
$this->requireSequences();
|
|
|
|
$x = $this->x;
|
|
$y = $this->y;
|
|
|
|
if (!$x && !$y) {
|
|
return $this->padEditString('');
|
|
}
|
|
|
|
if (!$x) {
|
|
return $this->padEditString(str_repeat('i', count($y)));
|
|
}
|
|
|
|
if (!$y) {
|
|
return $this->padEditString(str_repeat('d', count($x)));
|
|
}
|
|
|
|
if ($x === $y) {
|
|
return $this->padEditString(str_repeat('s', count($x)));
|
|
}
|
|
|
|
$max = $this->getMaximumLength();
|
|
if (count($x) > $max || count($y) > $max) {
|
|
$this->reachedMaximumLength = true;
|
|
return $this->padEditString(
|
|
str_repeat('d', count($x)).
|
|
str_repeat('i', count($y)));
|
|
}
|
|
|
|
$matrix = $this->getTypeMatrix();
|
|
|
|
$xx = count($x);
|
|
$yy = count($y);
|
|
|
|
$transposes = array();
|
|
$str = '';
|
|
while (true) {
|
|
$type = $matrix[$xx][$yy];
|
|
|
|
if (is_array($type)) {
|
|
$chr = 't';
|
|
$transposes[$type[0]][$type[1]] = true;
|
|
$type = $type[2];
|
|
} else {
|
|
$chr = $type;
|
|
}
|
|
|
|
if (isset($transposes[$xx][$yy])) {
|
|
$chr = 't';
|
|
}
|
|
|
|
if ($type == 's' || $type == 'x') {
|
|
$xx -= 1;
|
|
$yy -= 1;
|
|
} else if ($type == 'i') {
|
|
$yy -= 1;
|
|
} else if ($type == 'd') {
|
|
$xx -= 1;
|
|
} else {
|
|
throw new Exception(pht("Unknown type '%s' in type matrix.", $type));
|
|
}
|
|
|
|
$str .= $chr;
|
|
|
|
if ($xx <= 0 && $yy <= 0) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
$str = strrev($str);
|
|
|
|
// For full smoothing, we pad the edit string before smoothing it, so
|
|
// ranges of similar characters at the beginning or end of the string can
|
|
// also be smoothed.
|
|
|
|
// For internal smoothing, we only smooth ranges within the change itself.
|
|
|
|
$smoothing = $this->getApplySmoothing();
|
|
switch ($smoothing) {
|
|
case self::SMOOTHING_FULL:
|
|
$str = $this->padEditString($str);
|
|
$str = $this->applySmoothing($str, true);
|
|
break;
|
|
case self::SMOOTHING_INTERNAL:
|
|
$str = $this->applySmoothing($str, false);
|
|
$str = $this->padEditString($str);
|
|
break;
|
|
case self::SMOOTHING_NONE:
|
|
$str = $this->padEditString($str);
|
|
break;
|
|
default:
|
|
throw new Exception(
|
|
pht(
|
|
'Unknown smoothing type "%s".',
|
|
$smoothing));
|
|
}
|
|
|
|
return $str;
|
|
}
|
|
|
|
private function padEditString($str) {
|
|
if ($this->prefix) {
|
|
$str = str_repeat('s', count($this->prefix)).$str;
|
|
}
|
|
if ($this->suffix) {
|
|
$str = $str.str_repeat('s', count($this->suffix));
|
|
}
|
|
return $str;
|
|
}
|
|
|
|
private function getTypeMatrix() {
|
|
if (!$this->computeString) {
|
|
throw new PhutilInvalidStateException('setComputeString');
|
|
}
|
|
if ($this->typeMatrix === null) {
|
|
$this->computeMatrix($this->x, $this->y);
|
|
}
|
|
return $this->typeMatrix;
|
|
}
|
|
|
|
private function getDistanceMatrix() {
|
|
if ($this->distanceMatrix === null) {
|
|
$this->computeMatrix($this->x, $this->y);
|
|
}
|
|
return $this->distanceMatrix;
|
|
}
|
|
|
|
private function computeMatrix(array $x, array $y) {
|
|
$xl = count($x);
|
|
$yl = count($y);
|
|
|
|
$m = array();
|
|
|
|
$infinity = $this->getInfinity();
|
|
|
|
$use_damerau = ($this->transposeCost !== null);
|
|
if ($use_damerau) {
|
|
// Initialize the default cost of a transpose.
|
|
$m[-1][-1] = $infinity;
|
|
|
|
// Initialize the character position dictionary for Damerau.
|
|
$d = array();
|
|
foreach ($x as $c) {
|
|
$d[$c] = -1;
|
|
}
|
|
foreach ($y as $c) {
|
|
$d[$c] = -1;
|
|
}
|
|
|
|
// Initialize the transpose costs for Damerau.
|
|
for ($xx = 0; $xx <= $xl; $xx++) {
|
|
$m[$xx][-1] = $infinity;
|
|
}
|
|
|
|
for ($yy = 0; $yy <= $yl; $yy++) {
|
|
$m[-1][$yy] = $infinity;
|
|
}
|
|
}
|
|
|
|
// Initialize the top row of the matrix.
|
|
for ($xx = 0; $xx <= $xl; $xx++) {
|
|
$m[$xx][0] = $xx * $this->deleteCost;
|
|
}
|
|
|
|
// Initialize the left column of the matrix.
|
|
$cost = 0;
|
|
for ($yy = 0; $yy <= $yl; $yy++) {
|
|
$m[0][$yy] = $yy * $this->insertCost;
|
|
}
|
|
|
|
$use_types = ($this->computeString);
|
|
if ($use_types) {
|
|
$t = array();
|
|
for ($xx = 0; $xx <= $xl; $xx++) {
|
|
$t[$xx][0] = 'd';
|
|
}
|
|
for ($yy = 0; $yy <= $yl; $yy++) {
|
|
$t[0][$yy] = 'i';
|
|
}
|
|
$t[0][0] = 's';
|
|
}
|
|
|
|
$alt_cost = $this->getAlterCost();
|
|
if ($alt_cost && !$use_types) {
|
|
throw new Exception(
|
|
pht(
|
|
'If you provide an alter cost with %s, you must enable '.
|
|
'type computation with %s.',
|
|
'setAlterCost()',
|
|
'setComputeStrings()'));
|
|
}
|
|
|
|
// Build the edit distance matrix.
|
|
for ($xx = 1; $xx <= $xl; $xx++) {
|
|
$last_dy = -1;
|
|
for ($yy = 1; $yy <= $yl; $yy++) {
|
|
if ($use_damerau) {
|
|
$dx = $d[$y[$yy - 1]];
|
|
$dy = $last_dy;
|
|
}
|
|
|
|
if ($x[$xx - 1] === $y[$yy - 1]) {
|
|
$rep_cost = $m[$xx - 1][$yy - 1] + 0;
|
|
$rep_type = 's';
|
|
} else {
|
|
$rep_cost = $m[$xx - 1][$yy - 1] + $this->replaceCost;
|
|
$rep_type = 'x';
|
|
}
|
|
|
|
$del_cost = $m[$xx - 1][$yy ] + $this->deleteCost;
|
|
$ins_cost = $m[$xx ][$yy - 1] + $this->insertCost;
|
|
if ($alt_cost) {
|
|
$del_char = $t[$xx - 1][$yy ];
|
|
$ins_char = $t[$xx ][$yy - 1];
|
|
$rep_char = $t[$xx - 1][$yy - 1];
|
|
|
|
if ($del_char !== 'd') {
|
|
$del_cost += $alt_cost;
|
|
}
|
|
if ($ins_char !== 'i') {
|
|
$ins_cost += $alt_cost;
|
|
}
|
|
if ($rep_char !== $rep_type) {
|
|
$rep_cost += $alt_cost;
|
|
}
|
|
}
|
|
|
|
if ($rep_cost <= $del_cost && $rep_cost <= $ins_cost) {
|
|
$cost = $rep_cost;
|
|
$type = $rep_type;
|
|
if ($rep_type === 's') {
|
|
$last_dy = $yy - 1;
|
|
}
|
|
} else if ($ins_cost <= $del_cost) {
|
|
$cost = $ins_cost;
|
|
$type = 'i';
|
|
} else {
|
|
$cost = $del_cost;
|
|
$type = 'd';
|
|
}
|
|
|
|
if ($use_damerau) {
|
|
$trn_count = (($xx - $dx - 2) + ($yy - $dy - 2) + 1);
|
|
$trn_cost = $m[$dx][$dy] + ($trn_count * $this->transposeCost);
|
|
if ($trn_cost < $cost) {
|
|
$cost = $trn_cost;
|
|
$type = array($dx + 1, $dy + 1, $type);
|
|
}
|
|
}
|
|
|
|
$m[$xx][$yy] = $cost;
|
|
if ($use_types) {
|
|
$t[$xx][$yy] = $type;
|
|
}
|
|
}
|
|
|
|
if ($use_damerau) {
|
|
$d[$x[$xx - 1]] = ($xx - 1);
|
|
}
|
|
}
|
|
|
|
$this->distanceMatrix = $m;
|
|
if ($use_types) {
|
|
$this->typeMatrix = $t;
|
|
}
|
|
}
|
|
|
|
private function getInfinity() {
|
|
return INF;
|
|
}
|
|
|
|
private function printMatrix(array $m) {
|
|
$x = $this->x;
|
|
$y = $this->y;
|
|
|
|
$p = '% 8s ';
|
|
printf($p, ' ');
|
|
foreach (head($m) as $yk => $yv) {
|
|
printf($p, idx($y, $yk - 1, '-'));
|
|
}
|
|
echo "\n";
|
|
foreach ($m as $xk => $xv) {
|
|
printf($p, idx($x, $xk - 1, '-'));
|
|
foreach ($xv as $yk => $yv) {
|
|
printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv));
|
|
}
|
|
echo "\n";
|
|
}
|
|
}
|
|
|
|
private function printTypeMatrix(array $t) {
|
|
$x = $this->x;
|
|
$y = $this->y;
|
|
|
|
$p = '% 8s ';
|
|
printf($p, ' ');
|
|
foreach (head($t) as $yk => $yv) {
|
|
printf($p, idx($y, $yk - 1, '-'));
|
|
}
|
|
echo "\n";
|
|
foreach ($t as $xk => $xv) {
|
|
printf($p, idx($x, $xk - 1, '-'));
|
|
foreach ($xv as $yk => $yv) {
|
|
printf($p, ($yv == $this->getInfinity() ? 'inf' : $yv));
|
|
}
|
|
echo "\n";
|
|
}
|
|
}
|
|
|
|
private function applySmoothing($str, $full) {
|
|
if ($full) {
|
|
$prefix = '(^|[xdi])';
|
|
$suffix = '([xdi]|\z)';
|
|
} else {
|
|
$prefix = '([xdi])';
|
|
$suffix = '([xdi])';
|
|
}
|
|
|
|
// Smooth the string out, by replacing short runs of similar characters
|
|
// with 'x' operations. This makes the result more readable to humans,
|
|
// since there are fewer choppy runs of short added and removed substrings.
|
|
do {
|
|
$original = $str;
|
|
$str = preg_replace('/'.$prefix.'(s{3})'.$suffix.'/', '$1xxx$3', $str);
|
|
$str = preg_replace('/'.$prefix.'(s{2})'.$suffix.'/', '$1xx$3', $str);
|
|
$str = preg_replace('/'.$prefix.'(s{1})'.$suffix.'/', '$1x$3', $str);
|
|
} while ($str != $original);
|
|
|
|
return $str;
|
|
}
|
|
|
|
}
|