phorge(robots.txt): add more domains
Summary: Almost complete copy of rWEBXYZ. Cookie unset didn't work as hoped, comment it out.
Signed-off-by: Yongmin Hong <revi@omglol.email>
Test Plan: deploy it.
Reviewers: O1 revi & automations, revi
Reviewed By: O1 revi & automations, revi
Differential Revision: https://issuetracker.revi.xyz/D370
Parent: ba384f8349
Commit: a1117fa7c1
1 changed file with 45 additions and 18 deletions
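The Test Plan above is simply "deploy it"; as a rough, hypothetical illustration of how the deployed result could be spot-checked, a plain-PHP fetch of the served robots.txt might look like the sketch below. The URL is an assumption (pointing at the install named in the Differential Revision link), not something stated in the commit.

<?php
// Hypothetical post-deploy spot check: fetch the served robots.txt and print
// the Last-Modified header plus the first line (the '# version:' stamp).
// The URL is an assumption; substitute the actual Phorge install.
$url = 'https://issuetracker.revi.xyz/robots.txt';

$body = file_get_contents($url);
if ($body === false) {
  fwrite(STDERR, "Fetch failed\n");
  exit(1);
}

// $http_response_header is populated by PHP's HTTP stream wrapper.
foreach ($http_response_header as $header) {
  if (stripos($header, 'Last-Modified:') === 0) {
    echo $header, "\n";
  }
}

echo strtok($body, "\n"), "\n";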
@@ -17,7 +17,7 @@ public function processRequest() {
     // Version timestamp is when I started editing them.
     // Edit setLastModified at the bottom as well.
     // Calculate EpochTime via go/epoch
-    $out[] = '# version: 20240616T191900+0900';
+    $out[] = '# version: 20240619T151433+0900';
     $out[] = '# also at https://github.com/revi/sandbox.git';
     $out[] = 'User-Agent: *';
     $out[] = 'Disallow: /diffusion/';
@@ -88,7 +88,8 @@ public function processRequest() {
     $out[] = '# Ref: https://developers.facebook.com/docs/sharing/bot/';
     $out[] = 'User-agent: FacebookBot';
     $out[] = 'Disallow: /';
-    $out[] = '# DiffBot, though this one is known to have option to ignore robotstxt';
+    $out[] =
+      '# DiffBot, though this one is known to have option to ignore robotstxt';
     $out[] = '# Ref https://issuetracker.revi.xyz/u/robotstxtdiffbot';
     $out[] = 'User-agent: Diffbot';
     $out[] = 'Disallow: /';
@@ -96,19 +97,6 @@ public function processRequest() {
     $out[] = '# Ref: https://darkvisitors.com/agents/bytespider';
     $out[] = 'User-agent: Bytespider';
     $out[] = 'Disallow: /';
-    $out[] = '# Block PetalBot, misbehaving';
-    $out[] = 'User-agent: PetalBot';
-    $out[] = 'Disallow: /';
-    $out[] = '# Block peer39';
-    $out[] = 'User-agent: peer39_crawler';
-    $out[] = 'User-agent: peer39_crawler/1.0';
-    $out[] = 'Disallow: /';
-    $out[] = '# Block SemRushBot';
-    $out[] = 'User-agent: SemrushBot';
-    $out[] = 'Disallow: /';
-    $out[] = '# Block AhrefsBot';
-    $out[] = 'User-agent: AhrefsBot';
-    $out[] = 'Disallow: /';
     $out[] = '# See https://revi.xyz/robots.txt for rationales';
     $out[] = 'User-agent: TurnitinBot';
     $out[] = 'Disallow: /';
@@ -118,20 +106,59 @@ public function processRequest() {
     $out[] = 'Disallow: /';
     $out[] = 'User-agent: BLEXBot';
     $out[] = 'Disallow: /';
     $out[] = '# Block CheckMarkNetwork';
     $out[] =
       'User-agent: CheckMarkNetwork/1.0 (+https://www.checkmarknetwork.com/spider.html)';
     $out[] = 'Disallow: /';
     $out[] = 'User-agent: BrandVerity/1.0';
     $out[] = 'Disallow: /';
+    $out[] = '# Block peer39';
+    $out[] = 'User-agent: peer39_crawler';
+    $out[] = 'User-agent: peer39_crawler/1.0';
+    $out[] = 'Disallow: /';
+    $out[] = '# Block PetalBot, misbehaving';
+    $out[] = 'User-agent: PetalBot';
+    $out[] = 'Disallow: /';
+    $out[] = '# Block DotBot';
+    $out[] = 'User-agent: DotBot';
+    $out[] = 'Disallow: /';
+    $out[] = '# Block MegaIndex';
+    $out[] = 'User-agent: MegaIndex';
+    $out[] = 'Disallow: /';
+    $out[] = '# Block SerpstatBot';
+    $out[] = 'User-agent: serpstatbot';
+    $out[] = 'Disallow: /';
+    $out[] = '# Block SeekportBot';
+    $out[] = 'User-agent: SeekportBot';
+    $out[] = 'Disallow: /';
+    $out[] = '# Block SemRushBot';
+    $out[] = 'User-agent: SemrushBot';
+    $out[] = 'Disallow: /';
+    $out[] = '# Block AhrefsBot';
+    $out[] = 'User-agent: AhrefsBot';
+    $out[] = 'Disallow: /';
+    // Crawl-delay entries at the bottom
+    // Ref: https://github.com/otwcode/otwarchive/pull/4411#discussion_r1044351129
+    $out[] = '# Throttle MJ12bot';
+    $out[] = 'User-agent: MJ12bot';
+    $out[] = 'Crawl-delay: 10';
+    $out[] = '# Throttle YandexBot';
+    $out[] = 'User-agent: YandexBot';
+    $out[] = 'Crawl-delay: 5';
+    $out[] = '# Throttle BingBot';
+    $out[] = 'User-agent: bingbot';
+    $out[] = 'Crawl-delay: 5';
+    $out[] = '# Throttle all other bots';
     $out[] = 'User-agent: *';
-    $out[] = 'Crawl-delay: 1';
+    $out[] = 'Crawl-delay: 5';

     $content = implode("\n", $out)."\n";

     return id(new AphrontPlainTextResponse())
       ->setContent($content)
       ->setCacheDurationInSeconds(phutil_units('2 hours in seconds'))
-      ->setClientIDCookie(false)
+      // ->setClientIDCookie(false) (Doesn't work /shrug)
       ->setCanCDN(true)
-      ->setLastModified(1718533140);
+      ->setLastModified(1718777673);
   }
 }
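For readers outside Phorge, here is a minimal standalone sketch (plain PHP CLI; AphrontPlainTextResponse and phutil_units() are Phorge APIs and are deliberately omitted) of the assembly pattern the controller above uses: each $out[] entry becomes one line of robots.txt, the lines are joined with newlines, and the Last-Modified epoch is meant to match the '# version:' stamp at the top of the file.

<?php
// Minimal sketch of the assembly pattern used by the controller above.
// Only a few representative entries are shown; the real list is in the diff.
$out = array();
$out[] = '# version: 20240619T151433+0900';
$out[] = 'User-Agent: *';
$out[] = 'Disallow: /diffusion/';
$out[] = 'User-agent: PetalBot';
$out[] = 'Disallow: /';
$out[] = 'User-agent: *';
$out[] = 'Crawl-delay: 5';

// Same join as the controller: one robots.txt line per array entry.
$content = implode("\n", $out)."\n";
echo $content;

// 1718777673 is the epoch passed to setLastModified(); it corresponds to
// 2024-06-19T15:14:33+09:00, i.e. the '# version:' stamp above ("go/epoch"
// in the comment presumably refers to an internal epoch converter).
echo gmdate('D, d M Y H:i:s \G\M\T', 1718777673), "\n";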