mirror of
https://we.phorge.it/source/phorge.git
synced 2024-11-09 16:32:39 +01:00
As Harbormaster logs are processed, build a sparse map of byte offsets to line numbers
Summary: Depends on D19138. Ref T13088. When we want to read the last part of a logfile //and show accurate line numbers//, we need to be able to get from byte offsets to line numbers somehow. Our fundamental unit must remain byte offsets, because a test can emit an arbitrarily long line, and we should accommodate it cleanly if a test emits 2GB of the letter "A". To support going from byte offsets to line numbers, compute a map with periodic line markers throughout the offsets of the file. From here, we can figure out the line numbers for arbitrary positions in the file with only a constant amount of work. Test Plan: Added unit tests; ran unit tests. Subscribers: PHID-OPKG-gm6ozazyms6q6i22gyam Maniphest Tasks: T13088 Differential Revision: https://secure.phabricator.com/D19139
This commit is contained in:
parent
d6311044bb
commit
6dc341be87
6 changed files with 260 additions and 7 deletions
2
resources/sql/autopatches/20180223.log.04.linemap.sql
Normal file
2
resources/sql/autopatches/20180223.log.04.linemap.sql
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
ALTER TABLE {$NAMESPACE}_harbormaster.harbormaster_buildlog
|
||||||
|
ADD lineMap LONGTEXT NOT NULL COLLATE {$COLLATE_TEXT};
|
|
@ -0,0 +1,2 @@
|
||||||
|
UPDATE {$NAMESPACE}_harbormaster.harbormaster_buildlog
|
||||||
|
SET lineMap = '[]' WHERE lineMap = '';
|
|
@ -1230,6 +1230,7 @@ phutil_register_library_map(array(
|
||||||
'HarbormasterBuildLogDownloadController' => 'applications/harbormaster/controller/HarbormasterBuildLogDownloadController.php',
|
'HarbormasterBuildLogDownloadController' => 'applications/harbormaster/controller/HarbormasterBuildLogDownloadController.php',
|
||||||
'HarbormasterBuildLogPHIDType' => 'applications/harbormaster/phid/HarbormasterBuildLogPHIDType.php',
|
'HarbormasterBuildLogPHIDType' => 'applications/harbormaster/phid/HarbormasterBuildLogPHIDType.php',
|
||||||
'HarbormasterBuildLogQuery' => 'applications/harbormaster/query/HarbormasterBuildLogQuery.php',
|
'HarbormasterBuildLogQuery' => 'applications/harbormaster/query/HarbormasterBuildLogQuery.php',
|
||||||
|
'HarbormasterBuildLogTestCase' => 'applications/harbormaster/__tests__/HarbormasterBuildLogTestCase.php',
|
||||||
'HarbormasterBuildLogView' => 'applications/harbormaster/view/HarbormasterBuildLogView.php',
|
'HarbormasterBuildLogView' => 'applications/harbormaster/view/HarbormasterBuildLogView.php',
|
||||||
'HarbormasterBuildLogViewController' => 'applications/harbormaster/controller/HarbormasterBuildLogViewController.php',
|
'HarbormasterBuildLogViewController' => 'applications/harbormaster/controller/HarbormasterBuildLogViewController.php',
|
||||||
'HarbormasterBuildMessage' => 'applications/harbormaster/storage/HarbormasterBuildMessage.php',
|
'HarbormasterBuildMessage' => 'applications/harbormaster/storage/HarbormasterBuildMessage.php',
|
||||||
|
@ -6518,6 +6519,7 @@ phutil_register_library_map(array(
|
||||||
'HarbormasterBuildLogDownloadController' => 'HarbormasterController',
|
'HarbormasterBuildLogDownloadController' => 'HarbormasterController',
|
||||||
'HarbormasterBuildLogPHIDType' => 'PhabricatorPHIDType',
|
'HarbormasterBuildLogPHIDType' => 'PhabricatorPHIDType',
|
||||||
'HarbormasterBuildLogQuery' => 'PhabricatorCursorPagedPolicyAwareQuery',
|
'HarbormasterBuildLogQuery' => 'PhabricatorCursorPagedPolicyAwareQuery',
|
||||||
|
'HarbormasterBuildLogTestCase' => 'PhabricatorTestCase',
|
||||||
'HarbormasterBuildLogView' => 'AphrontView',
|
'HarbormasterBuildLogView' => 'AphrontView',
|
||||||
'HarbormasterBuildLogViewController' => 'HarbormasterController',
|
'HarbormasterBuildLogViewController' => 'HarbormasterController',
|
||||||
'HarbormasterBuildMessage' => array(
|
'HarbormasterBuildMessage' => array(
|
||||||
|
|
|
@ -0,0 +1,117 @@
|
||||||
|
<?php
|
||||||
|
|
||||||
|
final class HarbormasterBuildLogTestCase
  extends PhabricatorTestCase {

  /**
   * Feed each fixture log into a fresh build log, one part at a time, and
   * verify that the resulting list of line markers matches the expected
   * list of (byte offset, line count) pairs.
   */
  public function testBuildLogLineMaps() {
    foreach ($this->newLineMapCases() as $label => $case) {
      $marker_distance = $case['distance'];

      $build_log = new HarbormasterBuildLog();
      $build_log->setByteLength(0);

      // Each part is a separate append, so fixtures with many small parts
      // exercise the carry-over of state between updates.
      foreach ($case['parts'] as $chunk) {
        $build_log->updateLineMap($chunk, $marker_distance);
      }

      // The marker list is the first element of the stored line map.
      $line_map = $build_log->getLineMap();
      $markers = $line_map[0];

      $this->assertEqual(
        $case['expect'],
        $markers,
        pht('Line Map for "%s"', $label));
    }
  }

  /**
   * Build the table of fixtures, keyed by a label which is only used in
   * assertion messages.
   *
   * Each fixture has a marker "distance", a list of "parts" to append in
   * order, and the list of markers to "expect" afterwards.
   *
   * @return map<string, map<string, wild>> Fixtures keyed by label.
   */
  private function newLineMapCases() {
    // U+2603 SNOWMAN, a three-byte UTF8 character.
    $snowman = "\xE2\x98\x83";

    $cases = array();

    // A single large write with no newlines at all.
    $cases['no_newlines.log'] = array(
      'distance' => 64,
      'parts' => array(
        str_repeat('AAAAAAAA', 32),
      ),
      'expect' => array(
        array(64, 0),
        array(128, 0),
        array(192, 0),
        array(255, 0),
      ),
    );

    // The same bytes as above, delivered as many small writes.
    $cases['no_newlines_updated.log'] = array(
      'distance' => 64,
      'parts' => array_fill(0, 32, 'AAAAAAAA'),
      'expect' => array(
        array(64, 0),
        array(128, 0),
        array(192, 0),
      ),
    );

    $cases['one_newline.log'] = array(
      'distance' => 64,
      'parts' => array(
        str_repeat('AAAAAAAA', 16),
        "\n",
        str_repeat('AAAAAAAA', 16),
      ),
      'expect' => array(
        array(64, 0),
        array(127, 0),
        array(191, 1),
        array(255, 1),
      ),
    );

    $cases['several_newlines.log'] = array(
      'distance' => 64,
      'parts' => array_fill(0, 12, "AAAAAAAAAAAAAAAAAA\n"),
      'expect' => array(
        array(56, 2),
        array(113, 5),
        array(170, 8),
        array(227, 11),
      ),
    );

    // Mixes "\r", "\r\n" and "\n" line endings.
    $cases['mixed_newlines.log'] = array(
      'distance' => 64,
      'parts' => array(
        str_repeat('A', 63)."\r",
        str_repeat('A', 63)."\r\n",
        str_repeat('A', 63)."\n",
        str_repeat('A', 63),
      ),
      'expect' => array(
        array(63, 0),
        array(127, 1),
        array(191, 2),
        array(255, 3),
      ),
    );

    // As above, but the second part is one byte shorter.
    $cases['more_mixed_newlines.log'] = array(
      'distance' => 64,
      'parts' => array(
        str_repeat('A', 63)."\r",
        str_repeat('A', 62)."\r\n",
        str_repeat('A', 63)."\n",
        str_repeat('A', 63),
      ),
      'expect' => array(
        array(63, 0),
        array(128, 2),
        array(191, 2),
        array(254, 3),
      ),
    );

    // Multibyte characters only: every marker offset is a multiple of the
    // three-byte character width.
    $cases['emoji.log'] = array(
      'distance' => 64,
      'parts' => array(
        str_repeat($snowman, 64),
      ),
      'expect' => array(
        array(63, 0),
        array(126, 0),
        array(189, 0),
      ),
    );

    return $cases;
  }

}
|
|
@ -14,6 +14,7 @@ final class HarbormasterBuildLog
|
||||||
protected $filePHID;
|
protected $filePHID;
|
||||||
protected $byteLength;
|
protected $byteLength;
|
||||||
protected $chunkFormat;
|
protected $chunkFormat;
|
||||||
|
protected $lineMap = array();
|
||||||
|
|
||||||
private $buildTarget = self::ATTACHABLE;
|
private $buildTarget = self::ATTACHABLE;
|
||||||
private $rope;
|
private $rope;
|
||||||
|
@ -64,6 +65,9 @@ final class HarbormasterBuildLog
|
||||||
protected function getConfiguration() {
|
protected function getConfiguration() {
|
||||||
return array(
|
return array(
|
||||||
self::CONFIG_AUX_PHID => true,
|
self::CONFIG_AUX_PHID => true,
|
||||||
|
self::CONFIG_SERIALIZATION => array(
|
||||||
|
'lineMap' => self::SERIALIZATION_JSON,
|
||||||
|
),
|
||||||
self::CONFIG_COLUMN_SCHEMA => array(
|
self::CONFIG_COLUMN_SCHEMA => array(
|
||||||
// T6203/NULLABILITY
|
// T6203/NULLABILITY
|
||||||
// It seems like these should be non-nullable? All logs should have a
|
// It seems like these should be non-nullable? All logs should have a
|
||||||
|
@ -369,7 +373,8 @@ final class HarbormasterBuildLog
|
||||||
$this->writeChunk($encoding_text, $data_size, $append_data);
|
$this->writeChunk($encoding_text, $data_size, $append_data);
|
||||||
}
|
}
|
||||||
|
|
||||||
$this->byteLength += $data_size;
|
$this->updateLineMap($append_data);
|
||||||
|
|
||||||
$this->save();
|
$this->save();
|
||||||
$this->saveTransaction();
|
$this->saveTransaction();
|
||||||
|
|
||||||
|
@ -377,6 +382,130 @@ final class HarbormasterBuildLog
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Account for newly appended log data in the byte length and in the sparse
 * map from byte offsets to line counts.
 *
 * The stored line map is a four-element list:
 *
 *   - a list of markers, each a pair of (byte offset, number of newlines
 *     seen before that offset);
 *   - the number of bytes consumed since the last marker;
 *   - the number of newlines counted since the last marker; and
 *   - a base64-encoded tail of bytes which have not been consumed yet and
 *     are prepended to the data passed to the next call.
 *
 * @param string $append_data Raw bytes which were appended to the log.
 * @param int|null $marker_distance Optional target distance between
 *   markers, in bytes. When omitted (or otherwise falsy), half of
 *   CHUNK_BYTE_LIMIT is used.
 * @return this
 */
public function updateLineMap($append_data, $marker_distance = null) {
  $this->byteLength += strlen($append_data);

  if (!$marker_distance) {
    $marker_distance = (self::CHUNK_BYTE_LIMIT / 2);
  }

  if (!$this->lineMap) {
    $this->lineMap = array(
      array(),
      0,
      0,
      null,
    );
  }

  list($map, $map_bytes, $line_count, $prefix) = $this->lineMap;

  $buffer = $append_data;

  // Bytes held back by the previous call are processed first.
  if ($prefix) {
    $prefix = base64_decode($prefix);
    $buffer = $prefix.$buffer;
  }

  if ($map) {
    list($last_marker, $last_count) = last($map);
  } else {
    $last_marker = 0;
    $last_count = 0;
  }

  $max_utf8_width = 8;

  $pos = 0;
  $len = strlen($buffer);
  while (true) {
    // If we only have a few bytes left in the buffer, leave it as a prefix
    // for next time.
    if (($len - $pos) <= ($max_utf8_width * 2)) {
      $prefix = substr($buffer, $pos);
      break;
    }

    // The next slice we're going to look at is the smaller of:
    //
    //   - the number of bytes we need to make it to the next marker; or
    //   - all the bytes we have left, minus one.
    //
    // Always take at least one byte, so every pass through this loop makes
    // forward progress.

    $slice_length = min(
      ($marker_distance - $map_bytes),
      ($len - $pos) - 1);
    $slice_length = max(1, $slice_length);

    // We don't slice all the way to the end for two reasons.

    // First, we want to avoid slicing immediately after a "\r" if we don't
    // know what the next character is, because we want to make sure to
    // count "\r\n" as a single newline, rather than counting the "\r" as
    // a newline and then later counting the "\n" as another newline.

    // Second, we don't want to slice in the middle of a UTF8 character if
    // we can help it. We may not be able to avoid this, since the whole
    // buffer may just be binary data, but in most cases we can backtrack
    // a little bit and try to make it out of emoji or other legitimate
    // multibyte UTF8 characters which appear in the log.

    $full_length = $slice_length;
    $found_boundary = false;

    $min_width = max(1, $slice_length - $max_utf8_width);
    while ($slice_length >= $min_width) {
      $here = $buffer[$pos + ($slice_length - 1)];
      $next = $buffer[$pos + ($slice_length - 1) + 1];

      // If this is "\r" and the next character is "\n", extend the slice
      // to include the "\n". Otherwise, we're fine to slice here since we
      // know we're not in the middle of a UTF8 character.
      if ($here === "\r") {
        if ($next === "\n") {
          $slice_length++;
        }
        $found_boundary = true;
        break;
      }

      // If the next character is 0x7F or lower, or between 0xC2 and 0xF4,
      // we're not slicing in the middle of a UTF8 character.
      $ord = ord($next);
      if ($ord <= 0x7F || ($ord >= 0xC2 && $ord <= 0xF4)) {
        $found_boundary = true;
        break;
      }

      $slice_length--;
    }

    // If backtracking did not find a clean place to cut, this is probably
    // binary data: give up and cut at the original position. The first
    // pass through the loop above already established that the byte just
    // before that position is not "\r", so this can not split a "\r\n".
    // Keeping the backtracked length instead could cut after an unexamined
    // "\r", or leave a zero-length slice and never advance.
    if (!$found_boundary) {
      $slice_length = $full_length;
    }

    $slice = substr($buffer, $pos, $slice_length);
    $pos += $slice_length;

    $map_bytes += $slice_length;
    $line_count += count(preg_split("/\r\n|\r|\n/", $slice)) - 1;

    // Once we're within a character width of the target distance, emit a
    // marker and start counting from zero again.
    if ($map_bytes >= ($marker_distance - $max_utf8_width)) {
      $last_marker = $last_marker + $map_bytes;
      $last_count = $last_count + $line_count;

      $map[] = array(
        $last_marker,
        $last_count,
      );

      $line_count = 0;
      $map_bytes = 0;
    }
  }

  $this->lineMap = array(
    $map,
    $map_bytes,
    $line_count,
    base64_encode($prefix),
  );

  return $this;
}
|
||||||
|
|
||||||
|
|
||||||
/* -( PhabricatorPolicyInterface )----------------------------------------- */
|
/* -( PhabricatorPolicyInterface )----------------------------------------- */
|
||||||
|
|
||||||
|
|
|
@ -57,17 +57,18 @@ final class HarbormasterLogWorker extends HarbormasterWorker {
|
||||||
$data = $this->getTaskData();
|
$data = $this->getTaskData();
|
||||||
$is_force = idx($data, 'force');
|
$is_force = idx($data, 'force');
|
||||||
|
|
||||||
if (!$log->getByteLength() || $is_force) {
|
if (!$log->getByteLength() || !$log->getLineMap() || $is_force) {
|
||||||
$iterator = $log->newDataIterator();
|
$iterator = $log->newDataIterator();
|
||||||
|
|
||||||
$byte_length = 0;
|
$log
|
||||||
|
->setByteLength(0)
|
||||||
|
->setLineMap(array());
|
||||||
|
|
||||||
foreach ($iterator as $block) {
|
foreach ($iterator as $block) {
|
||||||
$byte_length += strlen($block);
|
$log->updateLineMap($block);
|
||||||
}
|
}
|
||||||
|
|
||||||
$log
|
$log->save();
|
||||||
->setByteLength($byte_length)
|
|
||||||
->save();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$format_text = HarbormasterBuildLogChunk::CHUNK_ENCODING_TEXT;
|
$format_text = HarbormasterBuildLogChunk::CHUNK_ENCODING_TEXT;
|
||||||
|
|
Loading…
Reference in a new issue