From 467eb9b2d13a5d6ad44ad2b936a30416d3e5cdf2 Mon Sep 17 00:00:00 2001 From: Yongmin Hong Date: Sat, 11 May 2024 17:28:38 +0900 Subject: [PATCH] robotstxt: just use * Summary: Fixes T138. Seems like Google doesn't care. Signed-off-by: Yongmin Hong Test Plan: Try http://go/google/search w/ updated robots.txt entries Reviewers: O1 revi & automations, revi Reviewed By: O1 revi & automations, revi Maniphest Tasks: T138 Differential Revision: https://issuetracker.revi.xyz/D75 --- .editorconfig | 32 +-- .../PhabricatorCustomRobotsTxtController.php | 194 +++++++++--------- 2 files changed, 103 insertions(+), 123 deletions(-) diff --git a/.editorconfig b/.editorconfig index 72dadd9..dd8c242 100644 --- a/.editorconfig +++ b/.editorconfig @@ -5,32 +5,12 @@ root = true [*] charset = utf-8 end_of_line = lf -indent_size = tab -indent_style = tab +indent_size = 2 +indent_style = space insert_final_newline = true -tab_width = 4 trim_trailing_whitespace = true -[*.md] -indent_style = space -indent_size = 2 - -[.arc*] -indent_size = 2 -indent_style = space - -# Tabs may not be valid YAML -# @see https://yaml.org/spec/1.2/spec.html#id2777534 -[*.{css,js,json,yml,yaml}] -indent_size = 2 -indent_style = space - -# yaml-lint configuration -# YAML files should be indented with spaces, which .yamllint is -[.yamllint] -indent_size = 2 -indent_style = space - -[.git/**] -indent_size = 2 -indent_style = space +[*.php] +# Use tabs for php file indentation +indent_style = tab +tab_width = 4 diff --git a/PhabExt/PhabricatorCustomRobotsTxtController.php b/PhabExt/PhabricatorCustomRobotsTxtController.php index 8c1b6f7..a72ae75 100644 --- a/PhabExt/PhabricatorCustomRobotsTxtController.php +++ b/PhabExt/PhabricatorCustomRobotsTxtController.php @@ -6,109 +6,109 @@ abstract class PhabricatorRobotsController extends PhabricatorController { public function shouldRequireLogin() { return false; - } + } public function setClientIDCookie() { return false; - } + } - public function processRequest() { - $out 
= array(); + public function processRequest() { + $out = array(); - $out[] = '# Adapted from phabricator.wikimedia.org, we.phorge.it'; - // Version timestamp is when I started editing them. - // Edit setLastModified at the bottom as well. - // Calculate EpochTime via go/epoch - $out[] = '# version: 20240509T235513+0900'; - $out[] = '# also at https://github.com/revi/sandbox.git'; - $out[] = 'User-Agent: *'; - $out[] = 'Disallow: /diffusion/'; - $out[] = 'Disallow: /source/'; - $out[] = 'Disallow: /multimeter/'; - $out[] = 'Disallow: /policy/explain'; - $out[] = 'Disallow: /auth'; - $out[] = 'Disallow: /login'; - $out[] = 'Disallow: /maniphest/transaction'; - $out[] = 'Disallow: /tag'; - $out[] = 'Disallow: /search/query/all'; - $out[] = 'Disallow: /conduit'; - $out[] = 'Disallow: /api'; - $out[] = 'Disallow: /project'; - $out[] = 'Disallow: /applications'; - $out[] = 'Disallow: /token'; - $out[] = 'Disallow: /pholio'; - $out[] = 'Disallow: /dashboard'; - $out[] = 'Disallow: /calendar'; - $out[] = 'Disallow: /herald'; - // This is commits. - $out[] = 'Disallow: /r*'; - // This is differential reviews - $out[] = 'Disallow: /differential'; - $out[] = 'Disallow: /D*%24*'; - // This is Files. (F$) - $out[] = 'Disallow: /file'; - $out[] = 'Disallow: /F*%24*'; - // This is pastes (P$) - $out[] = 'Disallow: /paste'; - $out[] = 'Disallow: /P*%24*'; - // This is blog entries (J$) - $out[] = 'Disallow: /phame'; - $out[] = 'Disallow: /J*%24*'; - // This is user list. - // As of 2024-04-17 user list is behind auth but who knows it might change? - $out[] = 'Disallow: /people'; - // This is user profile link. - $out[] = 'Disallow: /p/'; - // Phorge specific entries end here. 
- $out[] = '# This is cloudflare endpoint'; - $out[] = '# Ref: https://issuetracker.revi.xyz/u/cloudflarecdncgi'; - $out[] = 'Disallow: /cdn-cgi/'; - $out[] = '# Google Ads are not welcome'; - $out[] = 'User-agent: Mediapartners-Google'; - $out[] = 'Disallow: /'; - $out[] = 'User-agent: AdsBot-Google'; - $out[] = 'Disallow: /'; - $out[] = 'User-agent: AdsBot-Google-Mobile'; - $out[] = 'Disallow: /'; - // While I sometimes (borderline 'rare') use LLMs (GPT, Gemini, …), I'd rather prefer LLMs not use my stuff to profit - // Well I think my stuff is mostly out of interest for them, tho… - $out[] = '# ChatGPT Crawlers are not welcome'; - $out[] = '# Ref: https://platform.openai.com/docs/plugins/bot'; - $out[] = 'User-agent: ChatGPT-User'; - $out[] = 'Disallow: /'; - $out[] = 'User-agent: GPTBot'; - $out[] = 'Disallow: /'; - $out[] = '# Google Gemini AI Crawlers are also not welcome'; - $out[] = '# Ref: https://issuetracker.revi.xyz/u/googleextended'; - $out[] = 'User-agent: Google-Extended'; - $out[] = 'Disallow: /'; - $out[] = '# CCBot (ab)used to train LLMs'; - $out[] = '# Ref: https://darkvisitors.com/agents/ccbot'; - $out[] = 'User-agent: CCBot'; - $out[] = 'Disallow: /'; - $out[] = '# Facebook LLM Bot'; - $out[] = '# Ref: https://developers.facebook.com/docs/sharing/bot/'; - $out[] = 'User-agent: FacebookBot'; - $out[] = 'Disallow: /'; - $out[] = '# DiffBot, though this one is known to have option to ignore robotstxt'; - $out[] = '# Ref https://issuetracker.revi.xyz/u/robotstxtdiffbot'; - $out[] = 'User-agent: Diffbot'; - $out[] = 'Disallow: /'; - $out[] = '# Bytespider'; - $out[] = '# Ref: https://darkvisitors.com/agents/bytespider'; - $out[] = 'User-agent: Bytespider'; - $out[] = 'Disallow: /'; - // Crawl-delay entries at the bottom - // Ref: https://github.com/otwcode/otwarchive/pull/4411#discussion_r1044351129 - $out[] = 'User-agent: *'; - $out[] = 'Crawl-delay: 1'; + $out[] = '# Adapted from phabricator.wikimedia.org, we.phorge.it'; + // Version timestamp is 
when I started editing them. + // Edit setLastModified at the bottom as well. + // Calculate EpochTime via go/epoch + $out[] = '# version: 20240511T052727+0900'; + $out[] = '# also at https://github.com/revi/sandbox.git'; + $out[] = 'User-Agent: *'; + $out[] = 'Disallow: /diffusion/'; + $out[] = 'Disallow: /source/'; + $out[] = 'Disallow: /multimeter/'; + $out[] = 'Disallow: /policy/explain'; + $out[] = 'Disallow: /auth'; + $out[] = 'Disallow: /login'; + $out[] = 'Disallow: /maniphest/transaction'; + $out[] = 'Disallow: /tag'; + $out[] = 'Disallow: /search/query/all'; + $out[] = 'Disallow: /conduit'; + $out[] = 'Disallow: /api'; + $out[] = 'Disallow: /project'; + $out[] = 'Disallow: /applications'; + $out[] = 'Disallow: /token'; + $out[] = 'Disallow: /pholio'; + $out[] = 'Disallow: /dashboard'; + $out[] = 'Disallow: /calendar'; + $out[] = 'Disallow: /herald'; + // This is commits. + $out[] = 'Disallow: /r*'; + // This is differential revisions. (D*) + $out[] = 'Disallow: /differential'; + $out[] = 'Disallow: /D*'; + // This is Files. (F*) + $out[] = 'Disallow: /file'; + $out[] = 'Disallow: /F*'; + // This is pastes (P*) + $out[] = 'Disallow: /paste'; + $out[] = 'Disallow: /P*'; + // This is blog entries (J*) + $out[] = 'Disallow: /phame'; + $out[] = 'Disallow: /J*'; + // This is user list. + // As of 2024-04-17 user list is behind auth but who knows it might change? + $out[] = 'Disallow: /people'; + // This is user profile link. + $out[] = 'Disallow: /p/'; + // Phorge specific entries end here. 
+ $out[] = '# This is cloudflare endpoint'; + $out[] = '# Ref: https://issuetracker.revi.xyz/u/cloudflarecdncgi'; + $out[] = 'Disallow: /cdn-cgi/'; + $out[] = '# Google Ads are not welcome'; + $out[] = 'User-agent: Mediapartners-Google'; + $out[] = 'Disallow: /'; + $out[] = 'User-agent: AdsBot-Google'; + $out[] = 'Disallow: /'; + $out[] = 'User-agent: AdsBot-Google-Mobile'; + $out[] = 'Disallow: /'; + // While I sometimes (borderline 'rare') use LLMs (GPT, Gemini, …), I'd rather prefer LLMs not use my stuff to profit + // Well I think my stuff is mostly out of interest for them, tho… + $out[] = '# ChatGPT Crawlers are not welcome'; + $out[] = '# Ref: https://platform.openai.com/docs/plugins/bot'; + $out[] = 'User-agent: ChatGPT-User'; + $out[] = 'Disallow: /'; + $out[] = 'User-agent: GPTBot'; + $out[] = 'Disallow: /'; + $out[] = '# Google Gemini AI Crawlers are also not welcome'; + $out[] = '# Ref: https://issuetracker.revi.xyz/u/googleextended'; + $out[] = 'User-agent: Google-Extended'; + $out[] = 'Disallow: /'; + $out[] = '# CCBot (ab)used to train LLMs'; + $out[] = '# Ref: https://darkvisitors.com/agents/ccbot'; + $out[] = 'User-agent: CCBot'; + $out[] = 'Disallow: /'; + $out[] = '# Facebook LLM Bot'; + $out[] = '# Ref: https://developers.facebook.com/docs/sharing/bot/'; + $out[] = 'User-agent: FacebookBot'; + $out[] = 'Disallow: /'; + $out[] = '# DiffBot, though this one is known to have option to ignore robotstxt'; + $out[] = '# Ref https://issuetracker.revi.xyz/u/robotstxtdiffbot'; + $out[] = 'User-agent: Diffbot'; + $out[] = 'Disallow: /'; + $out[] = '# Bytespider'; + $out[] = '# Ref: https://darkvisitors.com/agents/bytespider'; + $out[] = 'User-agent: Bytespider'; + $out[] = 'Disallow: /'; + // Crawl-delay entries at the bottom + // Ref: https://github.com/otwcode/otwarchive/pull/4411#discussion_r1044351129 + $out[] = 'User-agent: *'; + $out[] = 'Crawl-delay: 1'; - $content = implode("\n", $out)."\n"; + $content = implode("\n", $out)."\n"; - return id(new 
AphrontPlainTextResponse()) - ->setContent($content) - ->setCacheDurationInSeconds(phutil_units('2 hours in seconds')) - ->setCanCDN(true) - ->setLastModified(1715266513); + return id(new AphrontPlainTextResponse()) + ->setContent($content) + ->setCacheDurationInSeconds(phutil_units('2 hours in seconds')) + ->setCanCDN(true) + ->setLastModified(1715372847); } }