1
0
Fork 0
mirror of https://we.phorge.it/source/phorge.git synced 2025-01-07 05:11:05 +01:00

As Harbormaster logs are processed, build a sparse map of byte offsets to line numbers

Summary:
Depends on D19138. Ref T13088. When we want to read the last part of a logfile //and show accurate line numbers//, we need to be able to get from byte offsets to line numbers somehow.

Our fundamental unit must remain byte offsets, because a test can emit an arbitrarily long line, and we should accommodate it cleanly if a test emits 2GB of the letter "A".

To support going from byte offsets to line numbers, compute a map with periodic line markers throughout the offsets of the file. From here, we can figure out the line numbers for arbitrary positions in the file with only a constant amount of work.

Test Plan: Added unit tests; ran unit tests.

Subscribers: PHID-OPKG-gm6ozazyms6q6i22gyam

Maniphest Tasks: T13088

Differential Revision: https://secure.phabricator.com/D19139
This commit is contained in:
epriestley 2018-02-23 07:10:18 -08:00
parent d6311044bb
commit 6dc341be87
6 changed files with 260 additions and 7 deletions

View file

@ -0,0 +1,2 @@
ALTER TABLE {$NAMESPACE}_harbormaster.harbormaster_buildlog
ADD lineMap LONGTEXT NOT NULL COLLATE {$COLLATE_TEXT};

View file

@ -0,0 +1,2 @@
UPDATE {$NAMESPACE}_harbormaster.harbormaster_buildlog
SET lineMap = '[]' WHERE lineMap = '';

View file

@ -1230,6 +1230,7 @@ phutil_register_library_map(array(
'HarbormasterBuildLogDownloadController' => 'applications/harbormaster/controller/HarbormasterBuildLogDownloadController.php', 'HarbormasterBuildLogDownloadController' => 'applications/harbormaster/controller/HarbormasterBuildLogDownloadController.php',
'HarbormasterBuildLogPHIDType' => 'applications/harbormaster/phid/HarbormasterBuildLogPHIDType.php', 'HarbormasterBuildLogPHIDType' => 'applications/harbormaster/phid/HarbormasterBuildLogPHIDType.php',
'HarbormasterBuildLogQuery' => 'applications/harbormaster/query/HarbormasterBuildLogQuery.php', 'HarbormasterBuildLogQuery' => 'applications/harbormaster/query/HarbormasterBuildLogQuery.php',
'HarbormasterBuildLogTestCase' => 'applications/harbormaster/__tests__/HarbormasterBuildLogTestCase.php',
'HarbormasterBuildLogView' => 'applications/harbormaster/view/HarbormasterBuildLogView.php', 'HarbormasterBuildLogView' => 'applications/harbormaster/view/HarbormasterBuildLogView.php',
'HarbormasterBuildLogViewController' => 'applications/harbormaster/controller/HarbormasterBuildLogViewController.php', 'HarbormasterBuildLogViewController' => 'applications/harbormaster/controller/HarbormasterBuildLogViewController.php',
'HarbormasterBuildMessage' => 'applications/harbormaster/storage/HarbormasterBuildMessage.php', 'HarbormasterBuildMessage' => 'applications/harbormaster/storage/HarbormasterBuildMessage.php',
@ -6518,6 +6519,7 @@ phutil_register_library_map(array(
'HarbormasterBuildLogDownloadController' => 'HarbormasterController', 'HarbormasterBuildLogDownloadController' => 'HarbormasterController',
'HarbormasterBuildLogPHIDType' => 'PhabricatorPHIDType', 'HarbormasterBuildLogPHIDType' => 'PhabricatorPHIDType',
'HarbormasterBuildLogQuery' => 'PhabricatorCursorPagedPolicyAwareQuery', 'HarbormasterBuildLogQuery' => 'PhabricatorCursorPagedPolicyAwareQuery',
'HarbormasterBuildLogTestCase' => 'PhabricatorTestCase',
'HarbormasterBuildLogView' => 'AphrontView', 'HarbormasterBuildLogView' => 'AphrontView',
'HarbormasterBuildLogViewController' => 'HarbormasterController', 'HarbormasterBuildLogViewController' => 'HarbormasterController',
'HarbormasterBuildMessage' => array( 'HarbormasterBuildMessage' => array(

View file

@ -0,0 +1,117 @@
<?php
final class HarbormasterBuildLogTestCase
extends PhabricatorTestCase {
public function testBuildLogLineMaps() {
$snowman = "\xE2\x98\x83";
$inputs = array(
'no_newlines.log' => array(
64,
array(
str_repeat('AAAAAAAA', 32),
),
array(
array(64, 0),
array(128, 0),
array(192, 0),
array(255, 0),
),
),
'no_newlines_updated.log' => array(
64,
array_fill(0, 32, 'AAAAAAAA'),
array(
array(64, 0),
array(128, 0),
array(192, 0),
),
),
'one_newline.log' => array(
64,
array(
str_repeat('AAAAAAAA', 16),
"\n",
str_repeat('AAAAAAAA', 16),
),
array(
array(64, 0),
array(127, 0),
array(191, 1),
array(255, 1),
),
),
'several_newlines.log' => array(
64,
array_fill(0, 12, "AAAAAAAAAAAAAAAAAA\n"),
array(
array(56, 2),
array(113, 5),
array(170, 8),
array(227, 11),
),
),
'mixed_newlines.log' => array(
64,
array(
str_repeat('A', 63)."\r",
str_repeat('A', 63)."\r\n",
str_repeat('A', 63)."\n",
str_repeat('A', 63),
),
array(
array(63, 0),
array(127, 1),
array(191, 2),
array(255, 3),
),
),
'more_mixed_newlines.log' => array(
64,
array(
str_repeat('A', 63)."\r",
str_repeat('A', 62)."\r\n",
str_repeat('A', 63)."\n",
str_repeat('A', 63),
),
array(
array(63, 0),
array(128, 2),
array(191, 2),
array(254, 3),
),
),
'emoji.log' => array(
64,
array(
str_repeat($snowman, 64),
),
array(
array(63, 0),
array(126, 0),
array(189, 0),
),
),
);
foreach ($inputs as $label => $input) {
list($distance, $parts, $expect) = $input;
$log = id(new HarbormasterBuildLog())
->setByteLength(0);
foreach ($parts as $part) {
$log->updateLineMap($part, $distance);
}
list($actual) = $log->getLineMap();
$this->assertEqual(
$expect,
$actual,
pht('Line Map for "%s"', $label));
}
}
}

View file

@ -14,6 +14,7 @@ final class HarbormasterBuildLog
protected $filePHID; protected $filePHID;
protected $byteLength; protected $byteLength;
protected $chunkFormat; protected $chunkFormat;
protected $lineMap = array();
private $buildTarget = self::ATTACHABLE; private $buildTarget = self::ATTACHABLE;
private $rope; private $rope;
@ -64,6 +65,9 @@ final class HarbormasterBuildLog
protected function getConfiguration() { protected function getConfiguration() {
return array( return array(
self::CONFIG_AUX_PHID => true, self::CONFIG_AUX_PHID => true,
self::CONFIG_SERIALIZATION => array(
'lineMap' => self::SERIALIZATION_JSON,
),
self::CONFIG_COLUMN_SCHEMA => array( self::CONFIG_COLUMN_SCHEMA => array(
// T6203/NULLABILITY // T6203/NULLABILITY
// It seems like these should be non-nullable? All logs should have a // It seems like these should be non-nullable? All logs should have a
@ -369,7 +373,8 @@ final class HarbormasterBuildLog
$this->writeChunk($encoding_text, $data_size, $append_data); $this->writeChunk($encoding_text, $data_size, $append_data);
} }
$this->byteLength += $data_size; $this->updateLineMap($append_data);
$this->save(); $this->save();
$this->saveTransaction(); $this->saveTransaction();
@ -377,6 +382,130 @@ final class HarbormasterBuildLog
} }
} }
public function updateLineMap($append_data, $marker_distance = null) {
$this->byteLength += strlen($append_data);
if (!$marker_distance) {
$marker_distance = (self::CHUNK_BYTE_LIMIT / 2);
}
if (!$this->lineMap) {
$this->lineMap = array(
array(),
0,
0,
null,
);
}
list($map, $map_bytes, $line_count, $prefix) = $this->lineMap;
$buffer = $append_data;
if ($prefix) {
$prefix = base64_decode($prefix);
$buffer = $prefix.$buffer;
}
if ($map) {
list($last_marker, $last_count) = last($map);
} else {
$last_marker = 0;
$last_count = 0;
}
$max_utf8_width = 8;
$next_marker = $last_marker + $marker_distance;
$pos = 0;
$len = strlen($buffer);
while (true) {
// If we only have a few bytes left in the buffer, leave it as a prefix
// for next time.
if (($len - $pos) <= ($max_utf8_width * 2)) {
$prefix = substr($buffer, $pos);
break;
}
// The next slice we're going to look at is the smaller of:
//
// - the number of bytes we need to make it to the next marker; or
// - all the bytes we have left, minus one.
$slice_length = min(
($marker_distance - $map_bytes),
($len - $pos) - 1);
// We don't slice all the way to the end for two reasons.
// First, we want to avoid slicing immediately after a "\r" if we don't
// know what the next character is, because we want to make sure to
// count "\r\n" as a single newline, rather than counting the "\r" as
// a newline and then later counting the "\n" as another newline.
// Second, we don't want to slice in the middle of a UTF8 character if
// we can help it. We may not be able to avoid this, since the whole
// buffer may just be binary data, but in most cases we can backtrack
// a little bit and try to make it out of emoji or other legitimate
// multibyte UTF8 characters which appear in the log.
$min_width = max(1, $slice_length - $max_utf8_width);
while ($slice_length >= $min_width) {
$here = $buffer[$pos + ($slice_length - 1)];
$next = $buffer[$pos + ($slice_length - 1) + 1];
// If this is "\r" and the next character is "\n", extend the slice
// to include the "\n". Otherwise, we're fine to slice here since we
// know we're not in the middle of a UTF8 character.
if ($here === "\r") {
if ($next === "\n") {
$slice_length++;
}
break;
}
// If the next character is 0x7F or lower, or between 0xC2 and 0xF4,
// we're not slicing in the middle of a UTF8 character.
$ord = ord($next);
if ($ord <= 0x7F || ($ord >= 0xC2 && $ord <= 0xF4)) {
break;
}
$slice_length--;
}
$slice = substr($buffer, $pos, $slice_length);
$pos += $slice_length;
$map_bytes += $slice_length;
$line_count += count(preg_split("/\r\n|\r|\n/", $slice)) - 1;
if ($map_bytes >= ($marker_distance - $max_utf8_width)) {
$map[] = array(
$last_marker + $map_bytes,
$last_count + $line_count,
);
$last_count = $last_count + $line_count;
$line_count = 0;
$last_marker = $last_marker + $map_bytes;
$map_bytes = 0;
$next_marker = $last_marker + $marker_distance;
}
}
$this->lineMap = array(
$map,
$map_bytes,
$line_count,
base64_encode($prefix),
);
return $this;
}
/* -( PhabricatorPolicyInterface )----------------------------------------- */ /* -( PhabricatorPolicyInterface )----------------------------------------- */

View file

@ -57,17 +57,18 @@ final class HarbormasterLogWorker extends HarbormasterWorker {
$data = $this->getTaskData(); $data = $this->getTaskData();
$is_force = idx($data, 'force'); $is_force = idx($data, 'force');
if (!$log->getByteLength() || $is_force) { if (!$log->getByteLength() || !$log->getLineMap() || $is_force) {
$iterator = $log->newDataIterator(); $iterator = $log->newDataIterator();
$byte_length = 0; $log
->setByteLength(0)
->setLineMap(array());
foreach ($iterator as $block) { foreach ($iterator as $block) {
$byte_length += strlen($block); $log->updateLineMap($block);
} }
$log $log->save();
->setByteLength($byte_length)
->save();
} }
$format_text = HarbormasterBuildLogChunk::CHUNK_ENCODING_TEXT; $format_text = HarbormasterBuildLogChunk::CHUNK_ENCODING_TEXT;