sandbox/PhabExt/PhabricatorCustomRobotsTxtController.php
revi 8084acebfa
Phorge(robots.txt): add applebot-extended to disallow
Summary:
Ref: https://support.apple.com/en-us/119829#datausage

Signed-off-by: Yongmin Hong <revi@omglol.email>

Test Plan: Verify the user-agent matches.

Reviewers: O1 revi & automations, revi

Reviewed By: O1 revi & automations, revi

Differential Revision: https://issuetracker.revi.xyz/D508
2024-07-04 00:12:13 +09:00

190 lines
6.6 KiB
PHP

<?php
// Copyright (C) 2024 Hong Yongmin <https://revi.xyz/>
// SPDX-License-Identifier: Apache-2.0
abstract class PhabricatorRobotsController extends PhabricatorController {
public function shouldRequireLogin() {
return false;
}
// TODO: Different content for cdn domains
public function processRequest() {
$out = array();
$out[] = '# Adapted from phabricator.wikimedia.org, we.phorge.it';
// Version timestamp is when I started editing them.
// Edit setLastModified at the bottom as well.
// Calculate EpochTime via go/epoch
$out[] = '# version: 20240703T230700+0900';
$out[] = '# also at https://github.com/revi/sandbox.git';
$out[] = 'User-Agent: *';
$out[] = 'Disallow: /diffusion/';
$out[] = 'Disallow: /diffusion';
$out[] = 'Disallow: /source/';
$out[] = 'Disallow: /source';
$out[] = 'Disallow: /multimeter/';
$out[] = 'Disallow: /multimeter';
$out[] = 'Disallow: /policy/explain';
$out[] = 'Disallow: /auth/';
$out[] = 'Disallow: /auth';
$out[] = 'Disallow: /login/';
$out[] = 'Disallow: /login';
$out[] = 'Disallow: /maniphest/transaction/';
$out[] = 'Disallow: /maniphest/transaction';
$out[] = 'Disallow: /tag/';
$out[] = 'Disallow: /tag';
$out[] = 'Disallow: /search/';
$out[] = 'Disallow: /search';
$out[] = 'Disallow: /conduit/';
$out[] = 'Disallow: /conduit';
$out[] = 'Disallow: /api/';
$out[] = 'Disallow: /api';
$out[] = 'Disallow: /project/';
$out[] = 'Disallow: /project';
$out[] = 'Disallow: /applications/';
$out[] = 'Disallow: /applications';
$out[] = 'Disallow: /token/';
$out[] = 'Disallow: /token';
$out[] = 'Disallow: /pholio/';
$out[] = 'Disallow: /pholio';
$out[] = 'Disallow: /dashboard/';
$out[] = 'Disallow: /dashboard';
$out[] = 'Disallow: /calendar';
$out[] = 'Disallow: /herald/';
$out[] = 'Disallow: /herald';
// This is commits.
$out[] = 'Disallow: /r*';
// This is differential revisions. (D*)
$out[] = 'Disallow: /differential/';
$out[] = 'Disallow: /differential';
$out[] = 'Disallow: /D*';
// This is Files. (F*)
$out[] = 'Disallow: /file/';
$out[] = 'Disallow: /file';
$out[] = 'Disallow: /F*';
// This is pastes (P*)
$out[] = 'Disallow: /paste/';
$out[] = 'Disallow: /paste';
$out[] = 'Disallow: /P*';
// This is blog entries (J$)
$out[] = 'Disallow: /phame/';
$out[] = 'Disallow: /phame';
$out[] = 'Disallow: /J*';
// This is user list.
// As of 2024-04-17 user list is behind auth but who knows it might change?
$out[] = 'Disallow: /people/';
$out[] = 'Disallow: /people';
// This is user profile link.
$out[] = 'Disallow: /p/';
// Phorge specific entries end here.
$out[] = '# This is cloudflare endpoint';
$out[] = '# Ref: https://issuetracker.revi.xyz/u/cloudflarecdncgi';
$out[] = 'Disallow: /cdn-cgi/';
$out[] = '# Google Ads are not welcome';
$out[] = 'User-agent: Mediapartners-Google';
$out[] = 'Disallow: /';
$out[] = 'User-agent: AdsBot-Google';
$out[] = 'Disallow: /';
$out[] = 'User-agent: AdsBot-Google-Mobile';
$out[] = 'Disallow: /';
// While I sometimes (borderline 'rare') use LLMs (GPT, Gemini, …), I'd rather prefer LLMs not use my stuff to profit
// Well I think my stuff is mostly out of interest for them, tho…
$out[] = '# ChatGPT Crawlers are not welcome';
$out[] = '# Ref: https://platform.openai.com/docs/plugins/bot';
$out[] = 'User-agent: ChatGPT-User';
$out[] = 'Disallow: /';
$out[] = 'User-agent: GPTBot';
$out[] = 'Disallow: /';
$out[] = '# Google Gemini AI Crawlers are also not welcome';
$out[] = '# Ref: https://issuetracker.revi.xyz/u/googleextended';
$out[] = 'User-agent: Google-Extended';
$out[] = 'Disallow: /';
$out[] = '# Apple AI stuff';
$out[] = '# Ref: https://support.apple.com/en-us/119829#datausage';
$out[] = 'User-agent: Applebot-Extended';
$out[] = 'Disallow: /';
$out[] = '# CCBot (ab)used to train LLMs';
$out[] = '# Ref: https://darkvisitors.com/agents/ccbot';
$out[] = 'User-agent: CCBot';
$out[] = 'Disallow: /';
$out[] = '# Facebook LLM Bot';
$out[] = '# Ref: https://developers.facebook.com/docs/sharing/bot/';
$out[] = 'User-agent: FacebookBot';
$out[] = 'Disallow: /';
$out[] =
'# DiffBot, though this one is known to have option to ignore robotstxt';
$out[] = '# Ref https://issuetracker.revi.xyz/u/robotstxtdiffbot';
$out[] = 'User-agent: Diffbot';
$out[] = 'Disallow: /';
$out[] = '# Bytespider';
$out[] = '# Ref: https://darkvisitors.com/agents/bytespider';
$out[] = 'User-agent: Bytespider';
$out[] = 'Disallow: /';
$out[] = '# See https://revi.xyz/robots.txt for rationales';
$out[] = 'User-agent: TurnitinBot';
$out[] = 'Disallow: /';
$out[] = 'User-agent: NPBot';
$out[] = 'Disallow: /';
$out[] = 'User-agent: SlySearch';
$out[] = 'Disallow: /';
$out[] = 'User-agent: BLEXBot';
$out[] = 'Disallow: /';
$out[] = '# Block CheckMarkNetwork';
$out[] =
'User-agent: CheckMarkNetwork/1.0 (+https://www.checkmarknetwork.com/spider.html)';
$out[] = 'Disallow: /';
$out[] = 'User-agent: BrandVerity/1.0';
$out[] = 'Disallow: /';
$out[] = '# Block peer39';
$out[] = 'User-agent: peer39_crawler';
$out[] = 'User-agent: peer39_crawler/1.0';
$out[] = 'Disallow: /';
$out[] = '# Block PetalBot, misbehaving';
$out[] = 'User-agent: PetalBot';
$out[] = 'Disallow: /';
$out[] = '# Block DotBot';
$out[] = 'User-agent: DotBot';
$out[] = 'Disallow: /';
$out[] = '# Block MegaIndex';
$out[] = 'User-agent: MegaIndex';
$out[] = 'Disallow: /';
$out[] = '# Block SerpstatBot';
$out[] = 'User-agent: serpstatbot';
$out[] = 'Disallow: /';
$out[] = '# Block SeekportBot';
$out[] = 'User-agent: SeekportBot';
$out[] = 'Disallow: /';
$out[] = '# Block SemRushBot';
$out[] = 'User-agent: SemrushBot';
$out[] = 'Disallow: /';
$out[] = '# Block AhrefsBot';
$out[] = 'User-agent: AhrefsBot';
$out[] = 'Disallow: /';
// Crawl-delay entries at the bottom
// Ref: https://github.com/otwcode/otwarchive/pull/4411#discussion_r1044351129
$out[] = '# Throttle MJ12bot';
$out[] = 'User-agent: MJ12bot';
$out[] = 'Crawl-delay: 10';
$out[] = '# Throttle YandexBot';
$out[] = 'User-agent: YandexBot';
$out[] = 'Crawl-delay: 5';
$out[] = '# Throttle BingBot';
$out[] = 'User-agent: bingbot';
$out[] = 'Crawl-delay: 5';
$out[] = '# Throttle all other bots';
$out[] = 'User-agent: *';
$out[] = 'Crawl-delay: 5';
$content = implode("\n", $out)."\n";
return id(new AphrontPlainTextResponse())
->setContent($content)
->setCacheDurationInSeconds(phutil_units('2 hours in seconds'))
// ->setClientIDCookie(false) (Doesn't work /shrug)
->setCanCDN(true)
->setLastModified(1720015620);
}
}