sandbox/PhabExt/PhabricatorCustomRobotsTxtController.php

<?php

/**
 * Controller that answers with a hard-coded robots.txt document.
 *
 * The document is assembled from a fixed list of lines, joined with
 * newlines, and returned as a plain-text response that is marked as
 * cacheable (including by a CDN).
 */
abstract class PhabricatorRobotsController extends PhabricatorController {

  /**
   * This endpoint does not require a logged-in viewer.
   *
   * @return bool Always false.
   */
  public function shouldRequireLogin() {
    return false;
  }

  /**
   * Build the robots.txt body and wrap it in a plain-text response.
   *
   * Each array entry below becomes exactly one line of output; a trailing
   * newline is appended after the last line.
   *
   * @return mixed Result of the setter chain on AphrontPlainTextResponse
   *   (presumably the response object itself; the setters are defined
   *   outside this file).
   */
  public function processRequest() {
    $robots_lines = array(
      // Provenance header. The version timestamp records when the current
      // round of edits was started.
      '# Forked from phabricator.wikimedia.org, we.phorge.it',
      '# version: 20240421T042000+0900',
      '# also at https://github.com/revi/sandbox.git',

      // Rules applying to every crawler.
      'User-Agent: *',
      'Disallow: /diffusion/',
      'Disallow: /source/',
      'Disallow: /multimeter/',
      'Disallow: /file/',
      'Disallow: /policy/explain',
      'Disallow: /auth',
      'Disallow: /login',
      'Disallow: /maniphest/transaction',
      'Disallow: /tag',
      'Disallow: /search/query/all',
      'Disallow: /conduit',
      'Disallow: /api',
      'Disallow: /project',
      'Disallow: /applications',
      'Disallow: /token',
      'Disallow: /pholio',
      'Disallow: /dashboard',
      'Disallow: /calendar',
      'Disallow: /herald',
      // Commits.
      'Disallow: /r*',
      // Pastes (P$); %24 is the percent-encoded "$".
      'Disallow: /P*%24*',
      'Disallow: /phame',
      // Blog entries (J$).
      'Disallow: /J*%24*',
      // User list. It sat behind auth as of 2024-04-17, but that could
      // change, so it is excluded explicitly.
      'Disallow: /people',
      // User profile links.
      'Disallow: /p/',
      // End of the Phorge-specific entries.

      '# This is cloudflare endpoint',
      '# Ref: https://developers.cloudflare.com/fundamentals/reference/cdn-cgi-endpoint/',
      'Disallow: /cdn-cgi/',

      '# Google Ads are not welcome',
      'User-agent: Mediapartners-Google',
      'Disallow: /',
      'User-agent: AdsBot-Google',
      'Disallow: /',
      'User-agent: AdsBot-Google-Mobile',
      'Disallow: /',

      // LLM-related crawlers: the author prefers that this content not be
      // used by them, even if it is probably of little interest anyway.
      '# ChatGPT Crawlers are not welcome',
      '# Ref: https://platform.openai.com/docs/plugins/bot',
      'User-agent: ChatGPT-User',
      'Disallow: /',
      'User-agent: GPTBot',
      'Disallow: /',
      '# Google Gemini AI Crawlers are also not welcome',
      '# Ref: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers?hl=en#google-extended',
      'User-agent: Google-Extended',
      'Disallow: /',
      '# CCBot (ab)used to train LLMs',
      '# Ref: https://darkvisitors.com/agents/ccbot',
      'User-agent: CCBot',
      'Disallow: /',
      '# Facebook LLM Bot',
      '# Ref: https://developers.facebook.com/docs/sharing/bot/',
      'User-agent: FacebookBot',
      'Disallow: /',
      '# DiffBot, though this one is known to have option to ignore robotstxt',
      '# Ref https://docs.diffbot.com/docs/why-is-my-crawl-not-crawling-and-other-uncommon-crawl-problems',
      'User-agent: Diffbot',
      'Disallow: /',
      '# Bytespider',
      '# Ref: https://darkvisitors.com/agents/bytespider',
      'User-agent: Bytespider',
      'Disallow: /',

      // Crawl-delay entries are deliberately kept at the very bottom.
      // Ref: https://github.com/otwcode/otwarchive/pull/4411#discussion_r1044351129
      'User-agent: *',
      'Crawl-delay: 1',
    );

    // One directive per line, with a newline after the final line.
    $body = implode("\n", $robots_lines)."\n";

    $cache_ttl = phutil_units('2 hours in seconds');

    return id(new AphrontPlainTextResponse())
      ->setContent($body)
      ->setCacheDurationInSeconds($cache_ttl)
      ->setCanCDN(true);
  }
}
initial commit Yet another sandbox. Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-15 19:14:13 +09:00			`<?php`

phab-ext: test Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-16 21:04:16 +09:00			`abstract class PhabricatorRobotsController extends PhabricatorController {`
initial commit Yet another sandbox. Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-15 19:14:13 +09:00
			`public function shouldRequireLogin() {`
			`return false;`
			`}`

			`public function processRequest() {`
			`$out = array();`

RobotsTxt: fix comment error Whoopsie. Bug: Ref T43 Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-16 21:17:47 +09:00			`$out[] = '# Forked from phabricator.wikimedia.org, we.phorge.it';`
RobotsTxt: add more rules Interest of nobody. Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-17 01:35:10 +09:00			`// Version timestamp is when I started editing them.`
PhabExt: add more domains More LLM domains Source: https://darkvisitors.com/docs/set-up-a-robots-txt Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-21 04:36:07 +09:00			`$out[] = '# version: 20240421T042000+0900';`
RobotsTxt: fix comment error Whoopsie. Bug: Ref T43 Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-16 21:17:47 +09:00			`$out[] = '# also at https://github.com/revi/sandbox.git';`
initial commit Yet another sandbox. Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-15 19:14:13 +09:00			`$out[] = 'User-Agent: *';`
phab-ext: test Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-16 21:04:16 +09:00			`$out[] = 'Disallow: /diffusion/';`
			`$out[] = 'Disallow: /source/';`
			`$out[] = 'Disallow: /multimeter/';`
			`$out[] = 'Disallow: /file/';`
			`$out[] = 'Disallow: /policy/explain';`
			`$out[] = 'Disallow: /auth';`
			`$out[] = 'Disallow: /login';`
			`$out[] = 'Disallow: /maniphest/transaction';`
			`$out[] = 'Disallow: /tag';`
			`$out[] = 'Disallow: /search/query/all';`
			`$out[] = 'Disallow: /conduit';`
			`$out[] = 'Disallow: /api';`
			`$out[] = 'Disallow: /project';`
			`$out[] = 'Disallow: /applications';`
			`$out[] = 'Disallow: /token';`
			`$out[] = 'Disallow: /pholio';`
			`$out[] = 'Disallow: /dashboard';`
			`$out[] = 'Disallow: /calendar';`
			`$out[] = 'Disallow: /herald';`
RobotsTxt: add more rules Interest of nobody. Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-17 01:35:10 +09:00			`// This is commits.`
phab-ext: test Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-16 21:04:16 +09:00			`$out[] = 'Disallow: /r*';`
RobotsTxt: add more rules Interest of nobody. Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-17 01:35:10 +09:00			`// This is pastes (P$)`
phab-ext: test Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-16 21:04:16 +09:00			`$out[] = 'Disallow: /P%24';`
RobotsTxt: add more rules Interest of nobody. Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-17 01:35:10 +09:00			`$out[] = 'Disallow: /phame';`
			`// This is blog entries (J$)`
RobotsTxt: fix typo And remove redundant rule. Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-17 01:42:05 +09:00			`$out[] = 'Disallow: /J%24';`
RobotsTxt: add more rules Interest of nobody. Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-17 01:35:10 +09:00			`// This is user list.`
			`// As of 2024-04-17 user list is behind auth but who knows it might change?`
RobotsTxt: fix another typo It's probably high time that I go to bed. Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-17 01:50:42 +09:00			`$out[] = 'Disallow: /people';`
RobotsTxt: add more rules Interest of nobody. Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-17 01:35:10 +09:00			`// This is user profile link.`
			`$out[] = 'Disallow: /p/';`
			`// Phorge specific entries end here.`
phab-ext: test Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-16 21:04:16 +09:00			`$out[] = '# This is cloudflare endpoint';`
RobotsTxt: fix comment error Whoopsie. Bug: Ref T43 Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-16 21:17:47 +09:00			`$out[] = '# Ref: https://developers.cloudflare.com/fundamentals/reference/cdn-cgi-endpoint/';`
phab-ext: test Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-16 21:04:16 +09:00			`$out[] = 'Disallow: /cdn-cgi/';`
			`$out[] = '# Google Ads are not welcome';`
			`$out[] = 'User-agent: Mediapartners-Google';`
			`$out[] = 'Disallow: /';`
			`$out[] = 'User-agent: AdsBot-Google';`
			`$out[] = 'Disallow: /';`
			`$out[] = 'User-agent: AdsBot-Google-Mobile';`
			`$out[] = 'Disallow: /';`
PhabExt: add more domains More LLM domains Source: https://darkvisitors.com/docs/set-up-a-robots-txt Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-21 04:36:07 +09:00			`// While I sometimes (borderline 'rare') use LLMs (GPT, Gemini, …), I'd rather prefer LLMs not use my stuff to profit`
			`// Well I think my stuff is mostly out of interest for them, tho…`
phab-ext: test Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-16 21:04:16 +09:00			`$out[] = '# ChatGPT Crawlers are not welcome';`
			`$out[] = '# Ref: https://platform.openai.com/docs/plugins/bot';`
			`$out[] = 'User-agent: ChatGPT-User';`
			`$out[] = 'Disallow: /';`
			`$out[] = 'User-agent: GPTBot';`
			`$out[] = 'Disallow: /';`
			`$out[] = '# Google Gemini AI Crawlers are also not welcome';`
			`$out[] = '# Ref: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers?hl=en#google-extended';`
			`$out[] = 'User-agent: Google-Extended';`
			`$out[] = 'Disallow: /';`
PhabExt: add more domains More LLM domains Source: https://darkvisitors.com/docs/set-up-a-robots-txt Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-21 04:36:07 +09:00			`$out[] = '# CCBot (ab)used to train LLMs';`
			`$out[] = '# Ref: https://darkvisitors.com/agents/ccbot';`
			`$out[] = 'User-agent: CCBot';`
			`$out[] = 'Disallow: /';`
			`$out[] = '# Facebook LLM Bot';`
			`$out[] = '# Ref: https://developers.facebook.com/docs/sharing/bot/';`
			`$out[] = 'User-agent: FacebookBot';`
			`$out[] = 'Disallow: /';`
			`$out[] = '# DiffBot, though this one is known to have option to ignore robotstxt';`
			`$out[] = '# Ref https://docs.diffbot.com/docs/why-is-my-crawl-not-crawling-and-other-uncommon-crawl-problems';`
			`$out[] = 'User-agent: Diffbot';`
			`$out[] = 'Disallow: /';`
			`$out[] = '# Bytespider';`
			`$out[] = '# Ref: https://darkvisitors.com/agents/bytespider';`
			`$out[] = 'User-agent: Bytespider';`
			`$out[] = 'Disallow: /';`
RobotsTxt: add more rules Interest of nobody. Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-17 01:35:10 +09:00			`// Crawl-delay entries at the bottom`
			`// Ref: https://github.com/otwcode/otwarchive/pull/4411#discussion_r1044351129`
phab-ext: test Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-16 21:04:16 +09:00			`$out[] = 'User-agent: *';`
initial commit Yet another sandbox. Bug: N/A Signed-off-by: Yongmin Hong <revi@omglol.email> 2024-04-15 19:14:13 +09:00			`$out[] = 'Crawl-delay: 1';`

			`$content = implode("\n", $out)."\n";`

			`return id(new AphrontPlainTextResponse())`
			`->setContent($content)`
			`->setCacheDurationInSeconds(phutil_units('2 hours in seconds'))`
			`->setCanCDN(true);`
			`}`
			`}`