phorge(robots.txt): add more domains

Summary:
Almost a complete copy of rWEBXYZ. The cookie unset didn't work as hoped,
so it is commented out.

Signed-off-by: Yongmin Hong <revi@omglol.email>

Test Plan: deploy it.

Reviewers: O1 revi & automations, revi

Reviewed By: O1 revi & automations, revi

Differential Revision: https://issuetracker.revi.xyz/D370
This commit is contained in:
revi 2024-06-19 15:15:38 +09:00
parent ba384f8349
commit a1117fa7c1
Signed by: revi
GPG key ID: 1EB4F6CEEA100E94

View file

@ -17,7 +17,7 @@ public function processRequest() {
// Version timestamp is when I started editing them. // Version timestamp is when I started editing them.
// Edit setLastModified at the bottom as well. // Edit setLastModified at the bottom as well.
// Calculate EpochTime via go/epoch // Calculate EpochTime via go/epoch
$out[] = '# version: 20240616T191900+0900'; $out[] = '# version: 20240619T151433+0900';
$out[] = '# also at https://github.com/revi/sandbox.git'; $out[] = '# also at https://github.com/revi/sandbox.git';
$out[] = 'User-Agent: *'; $out[] = 'User-Agent: *';
$out[] = 'Disallow: /diffusion/'; $out[] = 'Disallow: /diffusion/';
@ -88,7 +88,8 @@ public function processRequest() {
$out[] = '# Ref: https://developers.facebook.com/docs/sharing/bot/'; $out[] = '# Ref: https://developers.facebook.com/docs/sharing/bot/';
$out[] = 'User-agent: FacebookBot'; $out[] = 'User-agent: FacebookBot';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
$out[] = '# DiffBot, though this one is known to have option to ignore robotstxt'; $out[] =
'# DiffBot, though this one is known to have option to ignore robotstxt';
$out[] = '# Ref https://issuetracker.revi.xyz/u/robotstxtdiffbot'; $out[] = '# Ref https://issuetracker.revi.xyz/u/robotstxtdiffbot';
$out[] = 'User-agent: Diffbot'; $out[] = 'User-agent: Diffbot';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
@ -96,19 +97,6 @@ public function processRequest() {
$out[] = '# Ref: https://darkvisitors.com/agents/bytespider'; $out[] = '# Ref: https://darkvisitors.com/agents/bytespider';
$out[] = 'User-agent: Bytespider'; $out[] = 'User-agent: Bytespider';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
$out[] = '# Block PetalBot, misbehaving';
$out[] = 'User-agent: PetalBot';
$out[] = 'Disallow: /';
$out[] = '# Block peer39';
$out[] = 'User-agent: peer39_crawler';
$out[] = 'User-agent: peer39_crawler/1.0';
$out[] = 'Disallow: /';
$out[] = '# Block SemRushBot';
$out[] = 'User-agent: SemrushBot';
$out[] = 'Disallow: /';
$out[] = '# Block AhrefsBot';
$out[] = 'User-agent: AhrefsBot';
$out[] = 'Disallow: /';
$out[] = '# See https://revi.xyz/robots.txt for rationales'; $out[] = '# See https://revi.xyz/robots.txt for rationales';
$out[] = 'User-agent: TurnitinBot'; $out[] = 'User-agent: TurnitinBot';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
@ -118,20 +106,59 @@ public function processRequest() {
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
$out[] = 'User-agent: BLEXBot'; $out[] = 'User-agent: BLEXBot';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
$out[] = '# Block CheckMarkNetwork';
$out[] =
'User-agent: CheckMarkNetwork/1.0 (+https://www.checkmarknetwork.com/spider.html)';
$out[] = 'Disallow: /';
$out[] = 'User-agent: BrandVerity/1.0'; $out[] = 'User-agent: BrandVerity/1.0';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
$out[] = '# Block peer39';
$out[] = 'User-agent: peer39_crawler';
$out[] = 'User-agent: peer39_crawler/1.0';
$out[] = 'Disallow: /';
$out[] = '# Block PetalBot, misbehaving';
$out[] = 'User-agent: PetalBot';
$out[] = 'Disallow: /';
$out[] = '# Block DotBot';
$out[] = 'User-agent: DotBot';
$out[] = 'Disallow: /';
$out[] = '# Block MegaIndex';
$out[] = 'User-agent: MegaIndex';
$out[] = 'Disallow: /';
$out[] = '# Block SerpstatBot';
$out[] = 'User-agent: serpstatbot';
$out[] = 'Disallow: /';
$out[] = '# Block SeekportBot';
$out[] = 'User-agent: SeekportBot';
$out[] = 'Disallow: /';
$out[] = '# Block SemRushBot';
$out[] = 'User-agent: SemrushBot';
$out[] = 'Disallow: /';
$out[] = '# Block AhrefsBot';
$out[] = 'User-agent: AhrefsBot';
$out[] = 'Disallow: /';
// Crawl-delay entries at the bottom // Crawl-delay entries at the bottom
// Ref: https://github.com/otwcode/otwarchive/pull/4411#discussion_r1044351129 // Ref: https://github.com/otwcode/otwarchive/pull/4411#discussion_r1044351129
$out[] = '# Throttle MJ12bot';
$out[] = 'User-agent: MJ12bot';
$out[] = 'Crawl-delay: 10';
$out[] = '# Throttle YandexBot';
$out[] = 'User-agent: YandexBot';
$out[] = 'Crawl-delay: 5';
$out[] = '# Throttle BingBot';
$out[] = 'User-agent: bingbot';
$out[] = 'Crawl-delay: 5';
$out[] = '# Throttle all other bots';
$out[] = 'User-agent: *'; $out[] = 'User-agent: *';
$out[] = 'Crawl-delay: 1'; $out[] = 'Crawl-delay: 5';
$content = implode("\n", $out)."\n"; $content = implode("\n", $out)."\n";
return id(new AphrontPlainTextResponse()) return id(new AphrontPlainTextResponse())
->setContent($content) ->setContent($content)
->setCacheDurationInSeconds(phutil_units('2 hours in seconds')) ->setCacheDurationInSeconds(phutil_units('2 hours in seconds'))
->setClientIDCookie(false) // ->setClientIDCookie(false) (Doesn't work /shrug)
->setCanCDN(true) ->setCanCDN(true)
->setLastModified(1718533140); ->setLastModified(1718777673);
} }
} }