diff --git a/src/applications/differential/parser/changeset/DifferentialChangesetParser.php b/src/applications/differential/parser/changeset/DifferentialChangesetParser.php
index 2e45839f14..67ecbfc556 100644
--- a/src/applications/differential/parser/changeset/DifferentialChangesetParser.php
+++ b/src/applications/differential/parser/changeset/DifferentialChangesetParser.php
@@ -186,14 +186,6 @@ class DifferentialChangesetParser {
$this->parsedHunk = true;
$lines = $hunk->getChanges();
- // Flatten UTF-8 into "\0". We don't support UTF-8 because the diffing
- // algorithms are byte-oriented (not character oriented) and everyone seems
- // to be in agreement that it's fairly reasonable not to allow UTF-8 in
- // source files. These bytes will later be replaced with a "?" glyph, but
- // in the meantime we replace them with "\0" since Pygments is happy to
- // deal with that.
- $lines = preg_replace('/[\x80-\xFF]/', "\0", $lines);
-
$lines = str_replace(
array("\t", "\r\n", "\r"),
array(' ', "\n", "\n"),
@@ -702,11 +694,18 @@ class DifferentialChangesetParser {
protected function tokenHighlight(&$render) {
+ // TODO: This is a stopgap and should be fixed properly. Two byte-oriented
+ // algorithms (wordwrap and intraline diff) are not Unicode-aware: given a
+ // valid UTF-8 string, they can emit an invalid one by inserting markup
+ // inside a character's multi-byte sequence. The right fix is to make them
+ // UTF-8 aware. Until then, repair the possibly-broken string by replacing
+ // every non-ASCII byte (0x80-0xFF) with U+FFFD (the Unicode Replacement
+ // Character), so a multi-byte character yields one U+FFFD per byte.
foreach ($render as $key => $text) {
- $render[$key] = str_replace(
- "\0",
- ''."\xEF\xBF\xBD".'',
- $text);
+ $render[$key] = preg_replace(
+ '/[\x80-\xFF]/',
+ ''."\xEF\xBF\xBD".'',
+ $text);
}
}