robotstxt: just use *

Summary:
Fixes T138. Seems like Google doesn't care about the `*%24*` patterns, so just use a plain `*` wildcard instead.

Signed-off-by: Yongmin Hong <revi@omglol.email>

Test Plan: Try http://go/google/search with the updated robots.txt entries

Reviewers: O1 revi & automations, revi

Reviewed By: O1 revi & automations, revi

Maniphest Tasks: T138

Differential Revision: https://issuetracker.revi.xyz/D75
This commit is contained in:
revi 2024-05-11 17:28:38 +09:00
parent a257d3ccb4
commit 467eb9b2d1
Signed by: revi
GPG key ID: 1EB4F6CEEA100E94
2 changed files with 103 additions and 123 deletions

View file

@ -5,32 +5,12 @@ root = true
[*] [*]
charset = utf-8 charset = utf-8
end_of_line = lf end_of_line = lf
indent_size = tab indent_size = 2
indent_style = tab indent_style = space
insert_final_newline = true insert_final_newline = true
tab_width = 4
trim_trailing_whitespace = true trim_trailing_whitespace = true
[*.md] [.php]
indent_style = space # Use tabs for php file indentation
indent_size = 2 indent_style = tab
tab_width = 4
[.arc*]
indent_size = 2
indent_style = space
# Tabs may not be valid YAML
# @see https://yaml.org/spec/1.2/spec.html#id2777534
[*.{css,js,json,yml,yaml}]
indent_size = 2
indent_style = space
# yaml-lint configuration
# YAML files should be indented with spaces, which .yamllint is
[.yamllint]
indent_size = 2
indent_style = space
[.git/**]
indent_size = 2
indent_style = space

View file

@ -6,109 +6,109 @@ abstract class PhabricatorRobotsController extends PhabricatorController {
public function shouldRequireLogin() { public function shouldRequireLogin() {
return false; return false;
} }
public function setClientIDCookie() { public function setClientIDCookie() {
return false; return false;
} }
public function processRequest() { public function processRequest() {
$out = array(); $out = array();
$out[] = '# Adapted from phabricator.wikimedia.org, we.phorge.it'; $out[] = '# Adapted from phabricator.wikimedia.org, we.phorge.it';
// Version timestamp is when I started editing them. // Version timestamp is when I started editing them.
// Edit setLastModified at the bottom as well. // Edit setLastModified at the bottom as well.
// Calculate EpochTime via go/epoch // Calculate EpochTime via go/epoch
$out[] = '# version: 20240509T235513+0900'; $out[] = '# version: 20240511T052727+0900';
$out[] = '# also at https://github.com/revi/sandbox.git'; $out[] = '# also at https://github.com/revi/sandbox.git';
$out[] = 'User-Agent: *'; $out[] = 'User-Agent: *';
$out[] = 'Disallow: /diffusion/'; $out[] = 'Disallow: /diffusion/';
$out[] = 'Disallow: /source/'; $out[] = 'Disallow: /source/';
$out[] = 'Disallow: /multimeter/'; $out[] = 'Disallow: /multimeter/';
$out[] = 'Disallow: /policy/explain'; $out[] = 'Disallow: /policy/explain';
$out[] = 'Disallow: /auth'; $out[] = 'Disallow: /auth';
$out[] = 'Disallow: /login'; $out[] = 'Disallow: /login';
$out[] = 'Disallow: /maniphest/transaction'; $out[] = 'Disallow: /maniphest/transaction';
$out[] = 'Disallow: /tag'; $out[] = 'Disallow: /tag';
$out[] = 'Disallow: /search/query/all'; $out[] = 'Disallow: /search/query/all';
$out[] = 'Disallow: /conduit'; $out[] = 'Disallow: /conduit';
$out[] = 'Disallow: /api'; $out[] = 'Disallow: /api';
$out[] = 'Disallow: /project'; $out[] = 'Disallow: /project';
$out[] = 'Disallow: /applications'; $out[] = 'Disallow: /applications';
$out[] = 'Disallow: /token'; $out[] = 'Disallow: /token';
$out[] = 'Disallow: /pholio'; $out[] = 'Disallow: /pholio';
$out[] = 'Disallow: /dashboard'; $out[] = 'Disallow: /dashboard';
$out[] = 'Disallow: /calendar'; $out[] = 'Disallow: /calendar';
$out[] = 'Disallow: /herald'; $out[] = 'Disallow: /herald';
// This is commits. // This is commits.
$out[] = 'Disallow: /r*'; $out[] = 'Disallow: /r*';
// This is differential reviews // This is differential revisions. (D*)
$out[] = 'Disallow: /differential'; $out[] = 'Disallow: /differential';
$out[] = 'Disallow: /D*%24*'; $out[] = 'Disallow: /D*';
// This is Files. (F$) // This is Files. (F*)
$out[] = 'Disallow: /file'; $out[] = 'Disallow: /file';
$out[] = 'Disallow: /F*%24*'; $out[] = 'Disallow: /F*';
// This is pastes (P$) // This is pastes (P*)
$out[] = 'Disallow: /paste'; $out[] = 'Disallow: /paste';
$out[] = 'Disallow: /P*%24*'; $out[] = 'Disallow: /P*';
// This is blog entries (J$) // This is blog entries (J*)
$out[] = 'Disallow: /phame'; $out[] = 'Disallow: /phame';
$out[] = 'Disallow: /J*%24*'; $out[] = 'Disallow: /J*';
// This is user list. // This is user list.
// As of 2024-04-17 user list is behind auth but who knows it might change? // As of 2024-04-17 user list is behind auth but who knows it might change?
$out[] = 'Disallow: /people'; $out[] = 'Disallow: /people';
// This is user profile link. // This is user profile link.
$out[] = 'Disallow: /p/'; $out[] = 'Disallow: /p/';
// Phorge specific entries end here. // Phorge specific entries end here.
$out[] = '# This is cloudflare endpoint'; $out[] = '# This is cloudflare endpoint';
$out[] = '# Ref: https://issuetracker.revi.xyz/u/cloudflarecdncgi'; $out[] = '# Ref: https://issuetracker.revi.xyz/u/cloudflarecdncgi';
$out[] = 'Disallow: /cdn-cgi/'; $out[] = 'Disallow: /cdn-cgi/';
$out[] = '# Google Ads are not welcome'; $out[] = '# Google Ads are not welcome';
$out[] = 'User-agent: Mediapartners-Google'; $out[] = 'User-agent: Mediapartners-Google';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
$out[] = 'User-agent: AdsBot-Google'; $out[] = 'User-agent: AdsBot-Google';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
$out[] = 'User-agent: AdsBot-Google-Mobile'; $out[] = 'User-agent: AdsBot-Google-Mobile';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
// While I sometimes (borderline 'rare') use LLMs (GPT, Gemini, …), I'd rather prefer LLMs not use my stuff to profit // While I sometimes (borderline 'rare') use LLMs (GPT, Gemini, …), I'd rather prefer LLMs not use my stuff to profit
// Well I think my stuff is mostly out of interest for them, tho… // Well I think my stuff is mostly out of interest for them, tho…
$out[] = '# ChatGPT Crawlers are not welcome'; $out[] = '# ChatGPT Crawlers are not welcome';
$out[] = '# Ref: https://platform.openai.com/docs/plugins/bot'; $out[] = '# Ref: https://platform.openai.com/docs/plugins/bot';
$out[] = 'User-agent: ChatGPT-User'; $out[] = 'User-agent: ChatGPT-User';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
$out[] = 'User-agent: GPTBot'; $out[] = 'User-agent: GPTBot';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
$out[] = '# Google Gemini AI Crawlers are also not welcome'; $out[] = '# Google Gemini AI Crawlers are also not welcome';
$out[] = '# Ref: https://issuetracker.revi.xyz/u/googleextended'; $out[] = '# Ref: https://issuetracker.revi.xyz/u/googleextended';
$out[] = 'User-agent: Google-Extended'; $out[] = 'User-agent: Google-Extended';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
$out[] = '# CCBot (ab)used to train LLMs'; $out[] = '# CCBot (ab)used to train LLMs';
$out[] = '# Ref: https://darkvisitors.com/agents/ccbot'; $out[] = '# Ref: https://darkvisitors.com/agents/ccbot';
$out[] = 'User-agent: CCBot'; $out[] = 'User-agent: CCBot';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
$out[] = '# Facebook LLM Bot'; $out[] = '# Facebook LLM Bot';
$out[] = '# Ref: https://developers.facebook.com/docs/sharing/bot/'; $out[] = '# Ref: https://developers.facebook.com/docs/sharing/bot/';
$out[] = 'User-agent: FacebookBot'; $out[] = 'User-agent: FacebookBot';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
$out[] = '# DiffBot, though this one is known to have option to ignore robotstxt'; $out[] = '# DiffBot, though this one is known to have option to ignore robotstxt';
$out[] = '# Ref https://issuetracker.revi.xyz/u/robotstxtdiffbot'; $out[] = '# Ref https://issuetracker.revi.xyz/u/robotstxtdiffbot';
$out[] = 'User-agent: Diffbot'; $out[] = 'User-agent: Diffbot';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
$out[] = '# Bytespider'; $out[] = '# Bytespider';
$out[] = '# Ref: https://darkvisitors.com/agents/bytespider'; $out[] = '# Ref: https://darkvisitors.com/agents/bytespider';
$out[] = 'User-agent: Bytespider'; $out[] = 'User-agent: Bytespider';
$out[] = 'Disallow: /'; $out[] = 'Disallow: /';
// Crawl-delay entries at the bottom // Crawl-delay entries at the bottom
// Ref: https://github.com/otwcode/otwarchive/pull/4411#discussion_r1044351129 // Ref: https://github.com/otwcode/otwarchive/pull/4411#discussion_r1044351129
$out[] = 'User-agent: *'; $out[] = 'User-agent: *';
$out[] = 'Crawl-delay: 1'; $out[] = 'Crawl-delay: 1';
$content = implode("\n", $out)."\n"; $content = implode("\n", $out)."\n";
return id(new AphrontPlainTextResponse()) return id(new AphrontPlainTextResponse())
->setContent($content) ->setContent($content)
->setCacheDurationInSeconds(phutil_units('2 hours in seconds')) ->setCacheDurationInSeconds(phutil_units('2 hours in seconds'))
->setCanCDN(true) ->setCanCDN(true)
->setLastModified(1715266513); ->setLastModified(1715372847);
} }
} }