Make Differential linewrap utf-8 aware

Summary: Differential uses a byte-oriented linewrap algorithm. Instead, use a character-oriented one which will handle UTF-8 properly. This implies a very slight performance hit, but we only run this code for lines which need to wrap, and the results get cached. It took about 2.5ms for the test file on my machine. I'll keep an eye on it, but I think it's currently a manageable cost. Test Plan: Diffed this file: https://secure.phabricator.com/P43 ...and got it to render like this: https://secure.phabricator.com/file/info/PHID-FILE-331ac241bede705b193b/ To do so, I had to disable the un-utf8 block, which we can't actually do yet because of intraline diff, but it shows that once we can get rid of that, it works completely correctly. It will "sort of" work in the meantime (nothing terrible happens). Reviewers: jungejason, aran, tuomaspelkonen CC: aran, epriestley Differential Revision: 513
2025-02-23 03:59:25 +01:00 · 2011-06-24 09:25:32 -07:00 · 2011-06-24 09:25:32 -07:00 · 5cfc14cb43
commit 5cfc14cb43
parent 1b55c4bdc9
1 changed files with 33 additions and 15 deletions
--- a/src/applications/differential/parser/changeset/DifferentialChangesetParser.php
+++ b/src/applications/differential/parser/changeset/DifferentialChangesetParser.php
@ -660,36 +660,54 @@ class DifferentialChangesetParser {
    }
  }

-  protected function lineWrap($l) {
+  /**
+   * Hard-wrap a piece of UTF-8 text with embedded HTML tags and entities.
+   *
+   * @param   string An HTML string with tags and entities.
+   * @return  string Hard-wrapped string.
+   */
+  protected function lineWrap($line) {
    $c = 0;
-    $len = strlen($l);
-    $ins = array();
+    $break_here = array();
+
+    // Convert the UTF-8 string into a list of UTF-8 characters.
+    $vector = phutil_utf8v($line);
+    $len = count($vector);
+    $byte_pos = 0;
    for ($ii = 0; $ii < $len; ++$ii) {
-      if ($l[$ii] == '&') {
+      // An ampersand indicates an HTML entity; consume the whole thing (until
+      // ";") but treat it all as one character.
+      if ($vector[$ii] == '&') {
        do {
          ++$ii;
-        } while ($l[$ii] != ';');
+        } while ($vector[$ii] != ';');
        ++$c;
-      } else if ($l[$ii] == '<') {
+      // An "<" indicates an HTML tag, consume the whole thing but don't treat
+      // it as a character.
+      } else if ($vector[$ii] == '<') {
        do {
          ++$ii;
-        } while ($l[$ii] != '>');
+        } while ($vector[$ii] != '>');
      } else {
        ++$c;
      }
+
+      // Keep track of where we need to break the string later.
      if ($c == $this->lineWidth) {
-        $ins[] = ($ii + 1);
+        $break_here[$ii] = true;
        $c = 0;
      }
    }
-    while (($pos = array_pop($ins))) {
-      $l = substr_replace(
-        $l,
-        "<span class=\"over-the-line\">\xE2\xAC\x85</span><br />",
-        $pos,
-        0);
+
+    $result = array();
+    foreach ($vector as $ii => $char) {
+      $result[] = $char;
+      if (isset($break_here[$ii])) {
+        $result[] = "<span class=\"over-the-line\">!</span><br />";
+      }
    }
-    return $l;
+
+    return implode('', $result);
  }