From 6dc341be87088984beb6ff4a267c4deff57a0bd5 Mon Sep 17 00:00:00 2001 From: epriestley Date: Fri, 23 Feb 2018 07:10:18 -0800 Subject: [PATCH] As Harbormaster logs are processed, build a sparse map of byte offsets to line numbers Summary: Depends on D19138. Ref T13088. When we want to read the last part of a logfile //and show accurate line numbers//, we need to be able to get from byte offsets to line numbers somehow. Our fundamental unit must remain byte offsets, because a test can emit an arbitrarily long line, and we should accommodate it cleanly if a test emits 2GB of the letter "A". To support going from byte offsets to line numbers, compute a map with periodic line markers throughout the offsets of the file. From here, we can figure out the line numbers for arbitrary positions in the file with only a constant amount of work. Test Plan: Added unit tests; ran unit tests. Subscribers: PHID-OPKG-gm6ozazyms6q6i22gyam Maniphest Tasks: T13088 Differential Revision: https://secure.phabricator.com/D19139 --- .../autopatches/20180223.log.04.linemap.sql | 2 + .../20180223.log.05.linemapdefault.sql | 2 + src/__phutil_library_map__.php | 2 + .../HarbormasterBuildLogTestCase.php | 117 ++++++++++++++++ .../storage/build/HarbormasterBuildLog.php | 131 +++++++++++++++++- .../worker/HarbormasterLogWorker.php | 13 +- 6 files changed, 260 insertions(+), 7 deletions(-) create mode 100644 resources/sql/autopatches/20180223.log.04.linemap.sql create mode 100644 resources/sql/autopatches/20180223.log.05.linemapdefault.sql create mode 100644 src/applications/harbormaster/__tests__/HarbormasterBuildLogTestCase.php diff --git a/resources/sql/autopatches/20180223.log.04.linemap.sql b/resources/sql/autopatches/20180223.log.04.linemap.sql new file mode 100644 index 0000000000..75ed27cf7c --- /dev/null +++ b/resources/sql/autopatches/20180223.log.04.linemap.sql @@ -0,0 +1,2 @@ +ALTER TABLE {$NAMESPACE}_harbormaster.harbormaster_buildlog + ADD lineMap LONGTEXT NOT NULL COLLATE {$COLLATE_TEXT}; diff --git a/resources/sql/autopatches/20180223.log.05.linemapdefault.sql b/resources/sql/autopatches/20180223.log.05.linemapdefault.sql new file mode 100644 index 0000000000..59b4dc9a62 --- /dev/null +++ b/resources/sql/autopatches/20180223.log.05.linemapdefault.sql @@ -0,0 +1,2 @@ +UPDATE {$NAMESPACE}_harbormaster.harbormaster_buildlog + SET lineMap = '[]' WHERE lineMap = ''; diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php index be9cb258cc..01407ed67b 100644 --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -1230,6 +1230,7 @@ phutil_register_library_map(array( 'HarbormasterBuildLogDownloadController' => 'applications/harbormaster/controller/HarbormasterBuildLogDownloadController.php', 'HarbormasterBuildLogPHIDType' => 'applications/harbormaster/phid/HarbormasterBuildLogPHIDType.php', 'HarbormasterBuildLogQuery' => 'applications/harbormaster/query/HarbormasterBuildLogQuery.php', + 'HarbormasterBuildLogTestCase' => 'applications/harbormaster/__tests__/HarbormasterBuildLogTestCase.php', 'HarbormasterBuildLogView' => 'applications/harbormaster/view/HarbormasterBuildLogView.php', 'HarbormasterBuildLogViewController' => 'applications/harbormaster/controller/HarbormasterBuildLogViewController.php', 'HarbormasterBuildMessage' => 'applications/harbormaster/storage/HarbormasterBuildMessage.php', @@ -6518,6 +6519,7 @@ phutil_register_library_map(array( 'HarbormasterBuildLogDownloadController' => 'HarbormasterController', 'HarbormasterBuildLogPHIDType' => 'PhabricatorPHIDType', 'HarbormasterBuildLogQuery' => 'PhabricatorCursorPagedPolicyAwareQuery', + 'HarbormasterBuildLogTestCase' => 'PhabricatorTestCase', 'HarbormasterBuildLogView' => 'AphrontView', 'HarbormasterBuildLogViewController' => 'HarbormasterController', 'HarbormasterBuildMessage' => array( diff --git a/src/applications/harbormaster/__tests__/HarbormasterBuildLogTestCase.php b/src/applications/harbormaster/__tests__/HarbormasterBuildLogTestCase.php new file mode 100644 index 0000000000..6cbdd6b0db --- /dev/null +++ b/src/applications/harbormaster/__tests__/HarbormasterBuildLogTestCase.php @@ -0,0 +1,117 @@ + array( + 64, + array( + str_repeat('AAAAAAAA', 32), + ), + array( + array(64, 0), + array(128, 0), + array(192, 0), + array(255, 0), + ), + ), + 'no_newlines_updated.log' => array( + 64, + array_fill(0, 32, 'AAAAAAAA'), + array( + array(64, 0), + array(128, 0), + array(192, 0), + ), + ), + 'one_newline.log' => array( + 64, + array( + str_repeat('AAAAAAAA', 16), + "\n", + str_repeat('AAAAAAAA', 16), + ), + array( + array(64, 0), + array(127, 0), + array(191, 1), + array(255, 1), + ), + ), + 'several_newlines.log' => array( + 64, + array_fill(0, 12, "AAAAAAAAAAAAAAAAAA\n"), + array( + array(56, 2), + array(113, 5), + array(170, 8), + array(227, 11), + ), + ), + 'mixed_newlines.log' => array( + 64, + array( + str_repeat('A', 63)."\r", + str_repeat('A', 63)."\r\n", + str_repeat('A', 63)."\n", + str_repeat('A', 63), + ), + array( + array(63, 0), + array(127, 1), + array(191, 2), + array(255, 3), + ), + ), + 'more_mixed_newlines.log' => array( + 64, + array( + str_repeat('A', 63)."\r", + str_repeat('A', 62)."\r\n", + str_repeat('A', 63)."\n", + str_repeat('A', 63), + ), + array( + array(63, 0), + array(128, 2), + array(191, 2), + array(254, 3), + ), + ), + 'emoji.log' => array( + 64, + array( + str_repeat($snowman, 64), + ), + array( + array(63, 0), + array(126, 0), + array(189, 0), + ), + ), + ); + + foreach ($inputs as $label => $input) { + list($distance, $parts, $expect) = $input; + + $log = id(new HarbormasterBuildLog()) + ->setByteLength(0); + + foreach ($parts as $part) { + $log->updateLineMap($part, $distance); + } + + list($actual) = $log->getLineMap(); + + $this->assertEqual( + $expect, + $actual, + pht('Line Map for "%s"', $label)); + } + } + +} diff --git a/src/applications/harbormaster/storage/build/HarbormasterBuildLog.php b/src/applications/harbormaster/storage/build/HarbormasterBuildLog.php index 6b0b15d8b1..9b983f0733 100644 --- a/src/applications/harbormaster/storage/build/HarbormasterBuildLog.php +++ b/src/applications/harbormaster/storage/build/HarbormasterBuildLog.php @@ -14,6 +14,7 @@ final class HarbormasterBuildLog protected $filePHID; protected $byteLength; protected $chunkFormat; + protected $lineMap = array(); private $buildTarget = self::ATTACHABLE; private $rope; @@ -64,6 +65,9 @@ final class HarbormasterBuildLog protected function getConfiguration() { return array( self::CONFIG_AUX_PHID => true, + self::CONFIG_SERIALIZATION => array( + 'lineMap' => self::SERIALIZATION_JSON, + ), self::CONFIG_COLUMN_SCHEMA => array( // T6203/NULLABILITY // It seems like these should be non-nullable? All logs should have a @@ -369,7 +373,8 @@ final class HarbormasterBuildLog $this->writeChunk($encoding_text, $data_size, $append_data); } - $this->byteLength += $data_size; + $this->updateLineMap($append_data); + $this->save(); $this->saveTransaction(); @@ -377,6 +382,130 @@ final class HarbormasterBuildLog } } + public function updateLineMap($append_data, $marker_distance = null) { + $this->byteLength += strlen($append_data); + + if (!$marker_distance) { + $marker_distance = (self::CHUNK_BYTE_LIMIT / 2); + } + + if (!$this->lineMap) { + $this->lineMap = array( + array(), + 0, + 0, + null, + ); + } + + list($map, $map_bytes, $line_count, $prefix) = $this->lineMap; + + $buffer = $append_data; + + if ($prefix) { + $prefix = base64_decode($prefix); + $buffer = $prefix.$buffer; + } + + if ($map) { + list($last_marker, $last_count) = last($map); + } else { + $last_marker = 0; + $last_count = 0; + } + + $max_utf8_width = 8; + $next_marker = $last_marker + $marker_distance; + + $pos = 0; + $len = strlen($buffer); + while (true) { + // If we only have a few bytes left in the buffer, leave it as a prefix + // for next time. + if (($len - $pos) <= ($max_utf8_width * 2)) { + $prefix = substr($buffer, $pos); + break; + } + + // The next slice we're going to look at is the smaller of: + // + // - the number of bytes we need to make it to the next marker; or + // - all the bytes we have left, minus one. + + $slice_length = min( + ($marker_distance - $map_bytes), + ($len - $pos) - 1); + + // We don't slice all the way to the end for two reasons. + + // First, we want to avoid slicing immediately after a "\r" if we don't + // know what the next character is, because we want to make sure to + // count "\r\n" as a single newline, rather than counting the "\r" as + // a newline and then later counting the "\n" as another newline. + + // Second, we don't want to slice in the middle of a UTF8 character if + // we can help it. We may not be able to avoid this, since the whole + // buffer may just be binary data, but in most cases we can backtrack + // a little bit and try to make it out of emoji or other legitimate + // multibyte UTF8 characters which appear in the log. + + $min_width = max(1, $slice_length - $max_utf8_width); + while ($slice_length >= $min_width) { + $here = $buffer[$pos + ($slice_length - 1)]; + $next = $buffer[$pos + ($slice_length - 1) + 1]; + + // If this is "\r" and the next character is "\n", extend the slice + // to include the "\n". Otherwise, we're fine to slice here since we + // know we're not in the middle of a UTF8 character. + if ($here === "\r") { + if ($next === "\n") { + $slice_length++; + } + break; + } + + // If the next character is 0x7F or lower, or between 0xC2 and 0xF4, + // we're not slicing in the middle of a UTF8 character. + $ord = ord($next); + if ($ord <= 0x7F || ($ord >= 0xC2 && $ord <= 0xF4)) { + break; + } + + $slice_length--; + } + + $slice = substr($buffer, $pos, $slice_length); + $pos += $slice_length; + + $map_bytes += $slice_length; + $line_count += count(preg_split("/\r\n|\r|\n/", $slice)) - 1; + + if ($map_bytes >= ($marker_distance - $max_utf8_width)) { + $map[] = array( + $last_marker + $map_bytes, + $last_count + $line_count, + ); + + $last_count = $last_count + $line_count; + $line_count = 0; + + $last_marker = $last_marker + $map_bytes; + $map_bytes = 0; + + $next_marker = $last_marker + $marker_distance; + } + } + + $this->lineMap = array( + $map, + $map_bytes, + $line_count, + base64_encode($prefix), + ); + + return $this; + } + /* -( PhabricatorPolicyInterface )----------------------------------------- */ diff --git a/src/applications/harbormaster/worker/HarbormasterLogWorker.php b/src/applications/harbormaster/worker/HarbormasterLogWorker.php index fc8762de0a..d9d7511c27 100644 --- a/src/applications/harbormaster/worker/HarbormasterLogWorker.php +++ b/src/applications/harbormaster/worker/HarbormasterLogWorker.php @@ -57,17 +57,18 @@ final class HarbormasterLogWorker extends HarbormasterWorker { $data = $this->getTaskData(); $is_force = idx($data, 'force'); - if (!$log->getByteLength() || $is_force) { + if (!$log->getByteLength() || !$log->getLineMap() || $is_force) { $iterator = $log->newDataIterator(); - $byte_length = 0; + $log + ->setByteLength(0) + ->setLineMap(array()); + foreach ($iterator as $block) { - $byte_length += strlen($block); + $log->updateLineMap($block); } - $log - ->setByteLength($byte_length) - ->save(); + $log->save(); } $format_text = HarbormasterBuildLogChunk::CHUNK_ENCODING_TEXT;