From 59fb26f299c5075114ebc780dd856137b3b80fc7 Mon Sep 17 00:00:00 2001 From: Yongmin Hong Date: Sun, 21 Apr 2024 04:36:07 +0900 Subject: [PATCH] PhabExt: add more LLM user agents Block more LLM crawler user agents in robots.txt. Source: https://darkvisitors.com/docs/set-up-a-robots-txt Bug: N/A Signed-off-by: Yongmin Hong --- .../PhabricatorCustomRobotsTxtController.php | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/PhabExt/PhabricatorCustomRobotsTxtController.php b/PhabExt/PhabricatorCustomRobotsTxtController.php index 11fc45c..3244056 100644 --- a/PhabExt/PhabricatorCustomRobotsTxtController.php +++ b/PhabExt/PhabricatorCustomRobotsTxtController.php @@ -11,7 +11,7 @@ public function processRequest() { $out[] = '# Forked from phabricator.wikimedia.org, we.phorge.it'; // Version timestamp is when I started editing them. - $out[] = '# version: 20240417T011800+0900'; + $out[] = '# version: 20240421T042000+0900'; $out[] = '# also at https://github.com/revi/sandbox.git'; $out[] = 'User-Agent: *'; $out[] = 'Disallow: /diffusion/'; @@ -56,6 +56,8 @@ public function processRequest() { $out[] = 'Disallow: /'; $out[] = 'User-agent: AdsBot-Google-Mobile'; $out[] = 'Disallow: /'; + // While I sometimes (borderline 'rarely') use LLMs (GPT, Gemini, …), I'd rather LLMs not profit from my stuff. + // Well, I think my stuff is mostly of no interest to them, though… $out[] = '# ChatGPT Crawlers are not welcome'; $out[] = '# Ref: https://platform.openai.com/docs/plugins/bot'; $out[] = 'User-agent: ChatGPT-User'; @@ -66,6 +68,22 @@ public function processRequest() { $out[] = '# Ref: https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers?hl=en#google-extended'; $out[] = 'User-agent: Google-Extended'; $out[] = 'Disallow: /'; + $out[] = '# CCBot (ab)used to train LLMs'; + $out[] = '# Ref: https://darkvisitors.com/agents/ccbot'; + $out[] = 'User-agent: CCBot'; + $out[] = 'Disallow: /'; + $out[] = '# Facebook LLM Bot'; + $out[] = '# Ref: 
https://developers.facebook.com/docs/sharing/bot/'; + $out[] = 'User-agent: FacebookBot'; + $out[] = 'Disallow: /'; + $out[] = '# DiffBot, though this one is known to have option to ignore robotstxt'; + $out[] = '# Ref https://docs.diffbot.com/docs/why-is-my-crawl-not-crawling-and-other-uncommon-crawl-problems'; + $out[] = 'User-agent: Diffbot'; + $out[] = 'Disallow: /'; + $out[] = '# Bytespider'; + $out[] = '# Ref: https://darkvisitors.com/agents/bytespider'; + $out[] = 'User-agent: Bytespider'; + $out[] = 'Disallow: /'; // Crawl-delay entries at the bottom // Ref: https://github.com/otwcode/otwarchive/pull/4411#discussion_r1044351129 $out[] = 'User-agent: *';