From 76ed0c7ff7be9d87dd32f14bf9ddb7379261a410 Mon Sep 17 00:00:00 2001 From: Andre Klapper Date: Fri, 10 Nov 2023 12:56:43 +0100 Subject: [PATCH] Disallow webcrawlers to follow Paste line number anchor links Summary: Paste provides line anchor links in every single line of a paste. If webcrawlers follow these links, they index the very same Paste again. Thus disallow in robots.txt to reduce unneeded traffic and indexing time. Closes T15662 Test Plan: Go to `/robots.txt` in the web browser. Cross fingers that more webcrawlers abide by RFC 9309. Reviewers: O1 Blessed Committers, valerio.bozzolan Reviewed By: O1 Blessed Committers, valerio.bozzolan Subscribers: tobiaswiese, valerio.bozzolan, Matthew, Cigaryno Maniphest Tasks: T15662 Differential Revision: https://we.phorge.it/D25461 --- .../robots/PhabricatorRobotsPlatformController.php | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/applications/system/controller/robots/PhabricatorRobotsPlatformController.php b/src/applications/system/controller/robots/PhabricatorRobotsPlatformController.php index b4a3c4fa37..82028918d7 100644 --- a/src/applications/system/controller/robots/PhabricatorRobotsPlatformController.php +++ b/src/applications/system/controller/robots/PhabricatorRobotsPlatformController.php @@ -19,6 +19,13 @@ final class PhabricatorRobotsPlatformController $out[] = 'Disallow: /diffusion/'; $out[] = 'Disallow: /source/'; + // See T15662. Prevent indexing line anchor links in Pastes. Per RFC 9309 + // section 2.2.3, percent-encode "$" to avoid interpretation as end of + // match pattern. However, crawlers may not abide by it and instead follow the + // original standard at https://www.robotstxt.org/orig.html with no mention + // of how to interpret characters like "$" and thus entirely ignore this rule. + $out[] = 'Disallow: /P*%24*'; + // Add a small crawl delay (number of seconds between requests) for spiders // which respect it.
The intent here is to prevent spiders from affecting // performance for users. The possible cost is slower indexing, but that