mirror of
https://we.phorge.it/source/phorge.git
synced 2025-01-20 19:51:08 +01:00
As Harbormaster logs are processed, build a sparse map of byte offsets to line numbers
Summary: Depends on D19138. Ref T13088. When we want to read the last part of a logfile //and show accurate line numbers//, we need to be able to get from byte offsets to line numbers somehow. Our fundamental unit must remain byte offsets, because a test can emit an arbitrarily long line, and we should accommodate it cleanly if a test emits 2GB of the letter "A". To support going from byte offsets to line numbers, compute a map with periodic line markers throughout the offsets of the file. From here, we can figure out the line numbers for arbitrary positions in the file with only a constant amount of work. Test Plan: Added unit tests; ran unit tests. Subscribers: PHID-OPKG-gm6ozazyms6q6i22gyam Maniphest Tasks: T13088 Differential Revision: https://secure.phabricator.com/D19139
This commit is contained in:
parent
d6311044bb
commit
6dc341be87
6 changed files with 260 additions and 7 deletions
2
resources/sql/autopatches/20180223.log.04.linemap.sql
Normal file
2
resources/sql/autopatches/20180223.log.04.linemap.sql
Normal file
|
@ -0,0 +1,2 @@
|
|||
ALTER TABLE {$NAMESPACE}_harbormaster.harbormaster_buildlog
|
||||
ADD lineMap LONGTEXT NOT NULL COLLATE {$COLLATE_TEXT};
|
|
@ -0,0 +1,2 @@
|
|||
UPDATE {$NAMESPACE}_harbormaster.harbormaster_buildlog
|
||||
SET lineMap = '[]' WHERE lineMap = '';
|
|
@ -1230,6 +1230,7 @@ phutil_register_library_map(array(
|
|||
'HarbormasterBuildLogDownloadController' => 'applications/harbormaster/controller/HarbormasterBuildLogDownloadController.php',
|
||||
'HarbormasterBuildLogPHIDType' => 'applications/harbormaster/phid/HarbormasterBuildLogPHIDType.php',
|
||||
'HarbormasterBuildLogQuery' => 'applications/harbormaster/query/HarbormasterBuildLogQuery.php',
|
||||
'HarbormasterBuildLogTestCase' => 'applications/harbormaster/__tests__/HarbormasterBuildLogTestCase.php',
|
||||
'HarbormasterBuildLogView' => 'applications/harbormaster/view/HarbormasterBuildLogView.php',
|
||||
'HarbormasterBuildLogViewController' => 'applications/harbormaster/controller/HarbormasterBuildLogViewController.php',
|
||||
'HarbormasterBuildMessage' => 'applications/harbormaster/storage/HarbormasterBuildMessage.php',
|
||||
|
@ -6518,6 +6519,7 @@ phutil_register_library_map(array(
|
|||
'HarbormasterBuildLogDownloadController' => 'HarbormasterController',
|
||||
'HarbormasterBuildLogPHIDType' => 'PhabricatorPHIDType',
|
||||
'HarbormasterBuildLogQuery' => 'PhabricatorCursorPagedPolicyAwareQuery',
|
||||
'HarbormasterBuildLogTestCase' => 'PhabricatorTestCase',
|
||||
'HarbormasterBuildLogView' => 'AphrontView',
|
||||
'HarbormasterBuildLogViewController' => 'HarbormasterController',
|
||||
'HarbormasterBuildMessage' => array(
|
||||
|
|
|
@ -0,0 +1,117 @@
|
|||
<?php
|
||||
|
||||
final class HarbormasterBuildLogTestCase
|
||||
extends PhabricatorTestCase {
|
||||
|
||||
public function testBuildLogLineMaps() {
|
||||
$snowman = "\xE2\x98\x83";
|
||||
|
||||
$inputs = array(
|
||||
'no_newlines.log' => array(
|
||||
64,
|
||||
array(
|
||||
str_repeat('AAAAAAAA', 32),
|
||||
),
|
||||
array(
|
||||
array(64, 0),
|
||||
array(128, 0),
|
||||
array(192, 0),
|
||||
array(255, 0),
|
||||
),
|
||||
),
|
||||
'no_newlines_updated.log' => array(
|
||||
64,
|
||||
array_fill(0, 32, 'AAAAAAAA'),
|
||||
array(
|
||||
array(64, 0),
|
||||
array(128, 0),
|
||||
array(192, 0),
|
||||
),
|
||||
),
|
||||
'one_newline.log' => array(
|
||||
64,
|
||||
array(
|
||||
str_repeat('AAAAAAAA', 16),
|
||||
"\n",
|
||||
str_repeat('AAAAAAAA', 16),
|
||||
),
|
||||
array(
|
||||
array(64, 0),
|
||||
array(127, 0),
|
||||
array(191, 1),
|
||||
array(255, 1),
|
||||
),
|
||||
),
|
||||
'several_newlines.log' => array(
|
||||
64,
|
||||
array_fill(0, 12, "AAAAAAAAAAAAAAAAAA\n"),
|
||||
array(
|
||||
array(56, 2),
|
||||
array(113, 5),
|
||||
array(170, 8),
|
||||
array(227, 11),
|
||||
),
|
||||
),
|
||||
'mixed_newlines.log' => array(
|
||||
64,
|
||||
array(
|
||||
str_repeat('A', 63)."\r",
|
||||
str_repeat('A', 63)."\r\n",
|
||||
str_repeat('A', 63)."\n",
|
||||
str_repeat('A', 63),
|
||||
),
|
||||
array(
|
||||
array(63, 0),
|
||||
array(127, 1),
|
||||
array(191, 2),
|
||||
array(255, 3),
|
||||
),
|
||||
),
|
||||
'more_mixed_newlines.log' => array(
|
||||
64,
|
||||
array(
|
||||
str_repeat('A', 63)."\r",
|
||||
str_repeat('A', 62)."\r\n",
|
||||
str_repeat('A', 63)."\n",
|
||||
str_repeat('A', 63),
|
||||
),
|
||||
array(
|
||||
array(63, 0),
|
||||
array(128, 2),
|
||||
array(191, 2),
|
||||
array(254, 3),
|
||||
),
|
||||
),
|
||||
'emoji.log' => array(
|
||||
64,
|
||||
array(
|
||||
str_repeat($snowman, 64),
|
||||
),
|
||||
array(
|
||||
array(63, 0),
|
||||
array(126, 0),
|
||||
array(189, 0),
|
||||
),
|
||||
),
|
||||
);
|
||||
|
||||
foreach ($inputs as $label => $input) {
|
||||
list($distance, $parts, $expect) = $input;
|
||||
|
||||
$log = id(new HarbormasterBuildLog())
|
||||
->setByteLength(0);
|
||||
|
||||
foreach ($parts as $part) {
|
||||
$log->updateLineMap($part, $distance);
|
||||
}
|
||||
|
||||
list($actual) = $log->getLineMap();
|
||||
|
||||
$this->assertEqual(
|
||||
$expect,
|
||||
$actual,
|
||||
pht('Line Map for "%s"', $label));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -14,6 +14,7 @@ final class HarbormasterBuildLog
|
|||
protected $filePHID;
|
||||
protected $byteLength;
|
||||
protected $chunkFormat;
|
||||
protected $lineMap = array();
|
||||
|
||||
private $buildTarget = self::ATTACHABLE;
|
||||
private $rope;
|
||||
|
@ -64,6 +65,9 @@ final class HarbormasterBuildLog
|
|||
protected function getConfiguration() {
|
||||
return array(
|
||||
self::CONFIG_AUX_PHID => true,
|
||||
self::CONFIG_SERIALIZATION => array(
|
||||
'lineMap' => self::SERIALIZATION_JSON,
|
||||
),
|
||||
self::CONFIG_COLUMN_SCHEMA => array(
|
||||
// T6203/NULLABILITY
|
||||
// It seems like these should be non-nullable? All logs should have a
|
||||
|
@ -369,7 +373,8 @@ final class HarbormasterBuildLog
|
|||
$this->writeChunk($encoding_text, $data_size, $append_data);
|
||||
}
|
||||
|
||||
$this->byteLength += $data_size;
|
||||
$this->updateLineMap($append_data);
|
||||
|
||||
$this->save();
|
||||
$this->saveTransaction();
|
||||
|
||||
|
@ -377,6 +382,130 @@ final class HarbormasterBuildLog
|
|||
}
|
||||
}
|
||||
|
||||
public function updateLineMap($append_data, $marker_distance = null) {
|
||||
$this->byteLength += strlen($append_data);
|
||||
|
||||
if (!$marker_distance) {
|
||||
$marker_distance = (self::CHUNK_BYTE_LIMIT / 2);
|
||||
}
|
||||
|
||||
if (!$this->lineMap) {
|
||||
$this->lineMap = array(
|
||||
array(),
|
||||
0,
|
||||
0,
|
||||
null,
|
||||
);
|
||||
}
|
||||
|
||||
list($map, $map_bytes, $line_count, $prefix) = $this->lineMap;
|
||||
|
||||
$buffer = $append_data;
|
||||
|
||||
if ($prefix) {
|
||||
$prefix = base64_decode($prefix);
|
||||
$buffer = $prefix.$buffer;
|
||||
}
|
||||
|
||||
if ($map) {
|
||||
list($last_marker, $last_count) = last($map);
|
||||
} else {
|
||||
$last_marker = 0;
|
||||
$last_count = 0;
|
||||
}
|
||||
|
||||
$max_utf8_width = 8;
|
||||
$next_marker = $last_marker + $marker_distance;
|
||||
|
||||
$pos = 0;
|
||||
$len = strlen($buffer);
|
||||
while (true) {
|
||||
// If we only have a few bytes left in the buffer, leave it as a prefix
|
||||
// for next time.
|
||||
if (($len - $pos) <= ($max_utf8_width * 2)) {
|
||||
$prefix = substr($buffer, $pos);
|
||||
break;
|
||||
}
|
||||
|
||||
// The next slice we're going to look at is the smaller of:
|
||||
//
|
||||
// - the number of bytes we need to make it to the next marker; or
|
||||
// - all the bytes we have left, minus one.
|
||||
|
||||
$slice_length = min(
|
||||
($marker_distance - $map_bytes),
|
||||
($len - $pos) - 1);
|
||||
|
||||
// We don't slice all the way to the end for two reasons.
|
||||
|
||||
// First, we want to avoid slicing immediately after a "\r" if we don't
|
||||
// know what the next character is, because we want to make sure to
|
||||
// count "\r\n" as a single newline, rather than counting the "\r" as
|
||||
// a newline and then later counting the "\n" as another newline.
|
||||
|
||||
// Second, we don't want to slice in the middle of a UTF8 character if
|
||||
// we can help it. We may not be able to avoid this, since the whole
|
||||
// buffer may just be binary data, but in most cases we can backtrack
|
||||
// a little bit and try to make it out of emoji or other legitimate
|
||||
// multibyte UTF8 characters which appear in the log.
|
||||
|
||||
$min_width = max(1, $slice_length - $max_utf8_width);
|
||||
while ($slice_length >= $min_width) {
|
||||
$here = $buffer[$pos + ($slice_length - 1)];
|
||||
$next = $buffer[$pos + ($slice_length - 1) + 1];
|
||||
|
||||
// If this is "\r" and the next character is "\n", extend the slice
|
||||
// to include the "\n". Otherwise, we're fine to slice here since we
|
||||
// know we're not in the middle of a UTF8 character.
|
||||
if ($here === "\r") {
|
||||
if ($next === "\n") {
|
||||
$slice_length++;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// If the next character is 0x7F or lower, or between 0xC2 and 0xF4,
|
||||
// we're not slicing in the middle of a UTF8 character.
|
||||
$ord = ord($next);
|
||||
if ($ord <= 0x7F || ($ord >= 0xC2 && $ord <= 0xF4)) {
|
||||
break;
|
||||
}
|
||||
|
||||
$slice_length--;
|
||||
}
|
||||
|
||||
$slice = substr($buffer, $pos, $slice_length);
|
||||
$pos += $slice_length;
|
||||
|
||||
$map_bytes += $slice_length;
|
||||
$line_count += count(preg_split("/\r\n|\r|\n/", $slice)) - 1;
|
||||
|
||||
if ($map_bytes >= ($marker_distance - $max_utf8_width)) {
|
||||
$map[] = array(
|
||||
$last_marker + $map_bytes,
|
||||
$last_count + $line_count,
|
||||
);
|
||||
|
||||
$last_count = $last_count + $line_count;
|
||||
$line_count = 0;
|
||||
|
||||
$last_marker = $last_marker + $map_bytes;
|
||||
$map_bytes = 0;
|
||||
|
||||
$next_marker = $last_marker + $marker_distance;
|
||||
}
|
||||
}
|
||||
|
||||
$this->lineMap = array(
|
||||
$map,
|
||||
$map_bytes,
|
||||
$line_count,
|
||||
base64_encode($prefix),
|
||||
);
|
||||
|
||||
return $this;
|
||||
}
|
||||
|
||||
|
||||
/* -( PhabricatorPolicyInterface )----------------------------------------- */
|
||||
|
||||
|
|
|
@ -57,17 +57,18 @@ final class HarbormasterLogWorker extends HarbormasterWorker {
|
|||
$data = $this->getTaskData();
|
||||
$is_force = idx($data, 'force');
|
||||
|
||||
if (!$log->getByteLength() || $is_force) {
|
||||
if (!$log->getByteLength() || !$log->getLineMap() || $is_force) {
|
||||
$iterator = $log->newDataIterator();
|
||||
|
||||
$byte_length = 0;
|
||||
$log
|
||||
->setByteLength(0)
|
||||
->setLineMap(array());
|
||||
|
||||
foreach ($iterator as $block) {
|
||||
$byte_length += strlen($block);
|
||||
$log->updateLineMap($block);
|
||||
}
|
||||
|
||||
$log
|
||||
->setByteLength($byte_length)
|
||||
->save();
|
||||
$log->save();
|
||||
}
|
||||
|
||||
$format_text = HarbormasterBuildLogChunk::CHUNK_ENCODING_TEXT;
|
||||
|
|
Loading…
Reference in a new issue