1
0
Fork 0
mirror of https://we.phorge.it/source/arcanist.git synced 2024-11-22 23:02:41 +01:00
phorge-arcanist/scripts/utils/utf8.php

171 lines
3.7 KiB
PHP
Raw Normal View History

[Wilds] Remove libphutil Summary: Ref T13098. Historically, Phabricator was split into three parts: - Phabricator, the server. - Arcanist, the client. - libphutil, libraries shared between the client and server. One imagined use case for this was that `libphutil` might become a general-purpose library that other projects would use. However, this didn't really happen, and it seems unlikely to at this point: Phabricator has become a relatively more sophisticated application platform; we didn't end up seeing or encouraging much custom development; what custom development there is basically embraces all of Phabricator since there are huge advantages to doing so; and a general "open source is awful" sort of factor here in the sense that open source users often don't have goals well aligned to our goals. Turning "arc" into a client platform and building package management solidify us in this direction of being a standalone platform, not a standalone utility library. Phabricator also depends on `arcanist/`. If it didn't, there would be a small advantage to saying "shared code + client for client, shared code + server for server", but there's no such distinction and it seems unlikely that one will ever exist. Even if it did, I think this has little value. Nowadays, I think this separation has no advantages for us and one significant cost: it makes installing `arcanist` more difficult for end-users. This will need some more finessing (Phabricator will need some changes for compatibility, and a lot of stuff that still says "libphutil" or "phutil" may eventually want to say "arcanist"), and some stuff (like xhpast) is probably straight-up broken right now and needs some tweaking, but I don't anticipate any major issues here. There was never anything particularly magical about libphutil as a separate standalone library. Test Plan: Ran `arc`, it gets about as far as it did before. 
Reviewers: amckinley Reviewed By: amckinley Maniphest Tasks: T13098 Differential Revision: https://secure.phabricator.com/D19688
2018-09-18 19:37:45 +02:00
#!/usr/bin/env php
<?php
require_once dirname(dirname(__FILE__)).'/__init_script__.php';
// Build the command-line parser for this script from the raw argv, then
// declare the tagline, the usage synopsis, and the script-specific options.
$args = new PhutilArgumentParser($argv);
$args->setTagline(pht('utf8 charset test script'));
// Synopsis text covering both invocation forms: the default "show regions"
// form (with optional -C) and the "--test" form.
$args->setSynopsis(<<<EOHELP
**utf8.php** [-C n] __file__ ...
Show regions in files which are not valid UTF-8. With "-C n",
show __n__ lines of context instead of the default of 3. Use
"-" to read stdin.
**utf8.php** --test __file__ ...
Test for files which are not valid UTF-8. For example, this
will find all ".php" files under the working directory which
aren't valid UTF-8:
find . -type f -name '*.php' | xargs -n256 ./utf8.php -t
If the script exits with no output, all input files were
valid UTF-8.
EOHELP
);
// NOTE(review): presumably handles the parser's shared built-in flags before
// the script-specific spec below is applied -- confirm against
// PhutilArgumentParser.
$args->parseStandardArguments();
$args->parse(array(
// "context" option (short form "C"): takes a parameter named "lines",
// defaults to 3, and is declared as conflicting with "test".
array(
'name' => 'context',
'short' => 'C',
'param' => 'lines',
'default' => 3,
'help' => pht(
'Show __lines__ lines of context instead of the default 3.'),
'conflicts' => array(
'test' => pht('with %s, context is not shown.', '--test'),
),
),
// "test" option (short form "t"): declared without a parameter.
array(
'name' => 'test',
'short' => 't',
'help' => pht('Print file names containing invalid UTF-8 to stdout.'),
),
// "files" is declared as a wildcard spec; the script reads it back as the
// list of inputs to inspect.
array(
'name' => 'files',
'wildcard' => true,
),
));
// Read the parsed option values and the list of inputs back from the parser.
$is_test = $args->getArg('test');
$context = $args->getArg('context');
$files = $args->getArg('files');

// Without any inputs there is nothing to inspect; hand off to the parser's
// printHelpAndExit().
if (!$files) {
  $args->printHelpAndExit();
}

// "test" mode only lists the names of offending inputs; the default mode
// prints the offending lines with context.
$err = $is_test
  ? test($files)
  : show($files, $context);

exit($err);
/**
 * Load the raw contents for one input argument.
 *
 * @param string $file Path to read, or "-" to read from standard input.
 * @return mixed Whatever the underlying reader returns (for stdin,
 *   file_get_contents() yields a string, or false on failure).
 */
function read($file) {
  if ($file !== '-') {
    return Filesystem::readFile($file);
  }

  return file_get_contents('php://stdin');
}
/**
 * Map an input argument to the label printed for it.
 *
 * @param string $file Input argument as given on the command line.
 * @return string "stdin" for the "-" placeholder, otherwise the argument
 *   itself, unchanged.
 */
function name($file) {
  return ($file === '-') ? 'stdin' : $file;
}
/**
 * Print the label of every input whose contents are not valid UTF-8, one per
 * line. Inputs that are valid produce no output.
 *
 * @param array $files Inputs to check; "-" is read from stdin.
 * @return int Always 0.
 */
function test(array $files) {
  foreach ($files as $file) {
    if (phutil_is_utf8(read($file))) {
      continue;
    }

    echo name($file)."\n";
  }

  return 0;
}
/**
 * Print an OKAY/FAIL status line for each input and, for failing inputs, the
 * lines which are not valid UTF-8 together with surrounding context lines.
 * Bad lines are passed through show_problems() before printing.
 *
 * @param array $files Inputs to inspect; "-" is read from stdin.
 * @param int $context Number of lines to show before and after each bad
 *   line.
 * @return int Always 0.
 */
function show(array $files, $context) {
  foreach ($files as $file) {
    $data = read($file);
    $ok = phutil_is_utf8($data);

    if ($ok) {
      echo pht('OKAY');
    } else {
      echo pht('FAIL');
    }
    echo ' '.name($file)."\n";

    if ($ok) {
      continue;
    }

    $lines = explode("\n", $data);
    $len = count($lines);

    // $bad marks the lines which fail validation; $map marks every line
    // which should be printed (bad lines plus their context window).
    $map = array();
    $bad = array();
    foreach ($lines as $n => $line) {
      if (phutil_is_utf8($line)) {
        continue;
      }

      $bad[$n] = true;

      // Clamp the context window to the bounds of the file.
      $start = max(0, $n - $context);
      $end = min($len, $n + 1 + $context);
      for ($jj = $start; $jj < $end; $jj++) {
        $map[$jj] = true;
      }
    }

    // Nothing was selected for display (for example, a negative context
    // produces empty windows). max() of an empty array is an error on
    // PHP 8, so emit only the trailing blank line and move on.
    if (!$map) {
      echo "\n";
      continue;
    }

    // Indexes are 0-based but the printed line numbers are 1-based, so the
    // gutter must be sized for the largest index plus one; otherwise the
    // column is too narrow whenever the last shown line is 10, 100, ...
    $width = strlen((string)(max(array_keys($map)) + 1));

    // Set $last such that we print a newline on the first iteration through
    // the loop.
    $last = -2;
    foreach ($map as $idx => $ignored) {
      // A gap in the index sequence starts a new region; separate regions
      // with a blank line.
      if ($idx !== $last + 1) {
        echo "\n";
      }
      $last = $idx;

      $line = $lines[$idx];
      if (!empty($bad[$idx])) {
        $line = show_problems($line);
      }

      printf(" % {$width}d %s\n", $idx + 1, $line);
    }

    echo "\n";
  }

  return 0;
}
/**
 * Rewrite a line so that every byte which is not part of a well-formed UTF-8
 * sequence is replaced by a marker of the form "<0xNN>", passed through
 * phutil_console_format() with the '##%s##' format. Well-formed sequences are
 * copied through unchanged.
 *
 * The accepted sequences follow the well-formed byte ranges of RFC 3629:
 * overlong 3- and 4-byte forms, UTF-16 surrogates (ED A0..BF) and code points
 * above U+10FFFF (F4 90..) are reported as problems. As before, the ASCII
 * range starts at 0x01, so a NUL byte is also reported.
 *
 * @param string $line Raw bytes of a single line.
 * @return string The line with problem bytes replaced by markers.
 */
function show_problems($line) {
  // "\G" anchors each match at the current scan offset, so the line can be
  // walked in place instead of being re-copied with substr() on every step.
  $regex =
    "/\\G(?:".
      "[\x01-\x7F]+".
      "|[\xC2-\xDF][\x80-\xBF]".
      "|\xE0[\xA0-\xBF][\x80-\xBF]".
      "|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}".
      "|\xED[\x80-\x9F][\x80-\xBF]".
      "|\xF0[\x90-\xBF][\x80-\xBF]{2}".
      "|[\xF1-\xF3][\x80-\xBF]{3}".
      "|\xF4[\x80-\x8F][\x80-\xBF]{2}".
    ")/";

  $out = '';
  $pos = 0;
  $len = strlen($line);

  while ($pos < $len) {
    $match = null;
    if (preg_match($regex, $line, $match, 0, $pos)) {
      $out .= $match[0];
      $pos += strlen($match[0]);
    } else {
      // No well-formed sequence starts here: report this single byte as two
      // hex digits and resume scanning at the next byte.
      $chr = sprintf('<0x%02X>', ord($line[$pos]));
      $chr = phutil_console_format('##%s##', $chr);
      $out .= $chr;
      $pos++;
    }
  }

  return $out;
}