robotstxt: just use *
Summary: Fixes T138. Seems like Google doesn't care.
Signed-off-by: Yongmin Hong <revi@omglol.email>
Test Plan: Try http://go/google/search w/ updated robots.txt entries
Reviewers: O1 revi & automations, revi
Reviewed By: O1 revi & automations, revi
Maniphest Tasks: T138
Differential Revision: https://issuetracker.revi.xyz/D75
This commit is contained in:
parent
a257d3ccb4
commit
467eb9b2d1
2 changed files with 103 additions and 123 deletions
|
@ -5,32 +5,12 @@ root = true
|
|||
[*]
|
||||
charset = utf-8
|
||||
end_of_line = lf
|
||||
indent_size = tab
|
||||
indent_style = tab
|
||||
indent_size = 2
|
||||
indent_style = space
|
||||
insert_final_newline = true
|
||||
tab_width = 4
|
||||
trim_trailing_whitespace = true
|
||||
|
||||
[*.md]
|
||||
indent_style = space
|
||||
indent_size = 2
|
||||
|
||||
[.arc*]
|
||||
indent_size = 2
|
||||
indent_style = space
|
||||
|
||||
# Tabs may not be valid YAML
|
||||
# @see https://yaml.org/spec/1.2/spec.html#id2777534
|
||||
[*.{css,js,json,yml,yaml}]
|
||||
indent_size = 2
|
||||
indent_style = space
|
||||
|
||||
# yaml-lint configuration
|
||||
# .yamllint is itself a YAML file, so it should be indented with spaces
|
||||
[.yamllint]
|
||||
indent_size = 2
|
||||
indent_style = space
|
||||
|
||||
[.git/**]
|
||||
indent_size = 2
|
||||
indent_style = space
|
||||
[.php]
|
||||
# Use tabs for php file indentation
|
||||
indent_style = tab
|
||||
tab_width = 4
|
||||
|
|
|
@ -6,109 +6,109 @@ abstract class PhabricatorRobotsController extends PhabricatorController {
|
|||
|
||||
public function shouldRequireLogin() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public function setClientIDCookie() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public function processRequest() {
|
||||
$out = array();
|
||||
public function processRequest() {
|
||||
$out = array();
|
||||
|
||||
$out[] = '# Adapted from phabricator.wikimedia.org, we.phorge.it';
|
||||
// Version timestamp is when I started editing them.
|
||||
// Edit setLastModified at the bottom as well.
|
||||
// Calculate EpochTime via go/epoch
|
||||
$out[] = '# version: 20240509T235513+0900';
|
||||
$out[] = '# also at https://github.com/revi/sandbox.git';
|
||||
$out[] = 'User-Agent: *';
|
||||
$out[] = 'Disallow: /diffusion/';
|
||||
$out[] = 'Disallow: /source/';
|
||||
$out[] = 'Disallow: /multimeter/';
|
||||
$out[] = 'Disallow: /policy/explain';
|
||||
$out[] = 'Disallow: /auth';
|
||||
$out[] = 'Disallow: /login';
|
||||
$out[] = 'Disallow: /maniphest/transaction';
|
||||
$out[] = 'Disallow: /tag';
|
||||
$out[] = 'Disallow: /search/query/all';
|
||||
$out[] = 'Disallow: /conduit';
|
||||
$out[] = 'Disallow: /api';
|
||||
$out[] = 'Disallow: /project';
|
||||
$out[] = 'Disallow: /applications';
|
||||
$out[] = 'Disallow: /token';
|
||||
$out[] = 'Disallow: /pholio';
|
||||
$out[] = 'Disallow: /dashboard';
|
||||
$out[] = 'Disallow: /calendar';
|
||||
$out[] = 'Disallow: /herald';
|
||||
// This is commits.
|
||||
$out[] = 'Disallow: /r*';
|
||||
// This is differential reviews
|
||||
$out[] = 'Disallow: /differential';
|
||||
$out[] = 'Disallow: /D*%24*';
|
||||
// This is Files. (F$)
|
||||
$out[] = 'Disallow: /file';
|
||||
$out[] = 'Disallow: /F*%24*';
|
||||
// This is pastes (P$)
|
||||
$out[] = 'Disallow: /paste';
|
||||
$out[] = 'Disallow: /P*%24*';
|
||||
// This is blog entries (J$)
|
||||
$out[] = 'Disallow: /phame';
|
||||
$out[] = 'Disallow: /J*%24*';
|
||||
// This is user list.
|
||||
// As of 2024-04-17 user list is behind auth but who knows it might change?
|
||||
$out[] = 'Disallow: /people';
|
||||
// This is user profile link.
|
||||
$out[] = 'Disallow: /p/';
|
||||
// Phorge specific entries end here.
|
||||
$out[] = '# This is cloudflare endpoint';
|
||||
$out[] = '# Ref: https://issuetracker.revi.xyz/u/cloudflarecdncgi';
|
||||
$out[] = 'Disallow: /cdn-cgi/';
|
||||
$out[] = '# Google Ads are not welcome';
|
||||
$out[] = 'User-agent: Mediapartners-Google';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = 'User-agent: AdsBot-Google';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = 'User-agent: AdsBot-Google-Mobile';
|
||||
$out[] = 'Disallow: /';
|
||||
// While I sometimes (borderline 'rare') use LLMs (GPT, Gemini, …), I'd rather prefer LLMs not use my stuff to profit
|
||||
// Well I think my stuff is mostly out of interest for them, tho…
|
||||
$out[] = '# ChatGPT Crawlers are not welcome';
|
||||
$out[] = '# Ref: https://platform.openai.com/docs/plugins/bot';
|
||||
$out[] = 'User-agent: ChatGPT-User';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = 'User-agent: GPTBot';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = '# Google Gemini AI Crawlers are also not welcome';
|
||||
$out[] = '# Ref: https://issuetracker.revi.xyz/u/googleextended';
|
||||
$out[] = 'User-agent: Google-Extended';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = '# CCBot (ab)used to train LLMs';
|
||||
$out[] = '# Ref: https://darkvisitors.com/agents/ccbot';
|
||||
$out[] = 'User-agent: CCBot';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = '# Facebook LLM Bot';
|
||||
$out[] = '# Ref: https://developers.facebook.com/docs/sharing/bot/';
|
||||
$out[] = 'User-agent: FacebookBot';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = '# DiffBot, though this one is known to have option to ignore robotstxt';
|
||||
$out[] = '# Ref https://issuetracker.revi.xyz/u/robotstxtdiffbot';
|
||||
$out[] = 'User-agent: Diffbot';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = '# Bytespider';
|
||||
$out[] = '# Ref: https://darkvisitors.com/agents/bytespider';
|
||||
$out[] = 'User-agent: Bytespider';
|
||||
$out[] = 'Disallow: /';
|
||||
// Crawl-delay entries at the bottom
|
||||
// Ref: https://github.com/otwcode/otwarchive/pull/4411#discussion_r1044351129
|
||||
$out[] = 'User-agent: *';
|
||||
$out[] = 'Crawl-delay: 1';
|
||||
$out[] = '# Adapted from phabricator.wikimedia.org, we.phorge.it';
|
||||
// Version timestamp is when I started editing them.
|
||||
// Edit setLastModified at the bottom as well.
|
||||
// Calculate EpochTime via go/epoch
|
||||
$out[] = '# version: 20240511T052727+0900';
|
||||
$out[] = '# also at https://github.com/revi/sandbox.git';
|
||||
$out[] = 'User-Agent: *';
|
||||
$out[] = 'Disallow: /diffusion/';
|
||||
$out[] = 'Disallow: /source/';
|
||||
$out[] = 'Disallow: /multimeter/';
|
||||
$out[] = 'Disallow: /policy/explain';
|
||||
$out[] = 'Disallow: /auth';
|
||||
$out[] = 'Disallow: /login';
|
||||
$out[] = 'Disallow: /maniphest/transaction';
|
||||
$out[] = 'Disallow: /tag';
|
||||
$out[] = 'Disallow: /search/query/all';
|
||||
$out[] = 'Disallow: /conduit';
|
||||
$out[] = 'Disallow: /api';
|
||||
$out[] = 'Disallow: /project';
|
||||
$out[] = 'Disallow: /applications';
|
||||
$out[] = 'Disallow: /token';
|
||||
$out[] = 'Disallow: /pholio';
|
||||
$out[] = 'Disallow: /dashboard';
|
||||
$out[] = 'Disallow: /calendar';
|
||||
$out[] = 'Disallow: /herald';
|
||||
// This is commits.
|
||||
$out[] = 'Disallow: /r*';
|
||||
// This is differential revisions. (D*)
|
||||
$out[] = 'Disallow: /differential';
|
||||
$out[] = 'Disallow: /D*';
|
||||
// This is Files. (F*)
|
||||
$out[] = 'Disallow: /file';
|
||||
$out[] = 'Disallow: /F*';
|
||||
// This is pastes (P*)
|
||||
$out[] = 'Disallow: /paste';
|
||||
$out[] = 'Disallow: /P*';
|
||||
// This is blog entries (J*)
|
||||
$out[] = 'Disallow: /phame';
|
||||
$out[] = 'Disallow: /J*';
|
||||
// This is user list.
|
||||
// As of 2024-04-17 user list is behind auth but who knows it might change?
|
||||
$out[] = 'Disallow: /people';
|
||||
// This is user profile link.
|
||||
$out[] = 'Disallow: /p/';
|
||||
// Phorge specific entries end here.
|
||||
$out[] = '# This is cloudflare endpoint';
|
||||
$out[] = '# Ref: https://issuetracker.revi.xyz/u/cloudflarecdncgi';
|
||||
$out[] = 'Disallow: /cdn-cgi/';
|
||||
$out[] = '# Google Ads are not welcome';
|
||||
$out[] = 'User-agent: Mediapartners-Google';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = 'User-agent: AdsBot-Google';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = 'User-agent: AdsBot-Google-Mobile';
|
||||
$out[] = 'Disallow: /';
|
||||
// While I sometimes (borderline 'rare') use LLMs (GPT, Gemini, …), I'd rather LLMs not use my stuff for profit
|
||||
// Well, I think my stuff is mostly of no interest to them, tho…
|
||||
$out[] = '# ChatGPT Crawlers are not welcome';
|
||||
$out[] = '# Ref: https://platform.openai.com/docs/plugins/bot';
|
||||
$out[] = 'User-agent: ChatGPT-User';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = 'User-agent: GPTBot';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = '# Google Gemini AI Crawlers are also not welcome';
|
||||
$out[] = '# Ref: https://issuetracker.revi.xyz/u/googleextended';
|
||||
$out[] = 'User-agent: Google-Extended';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = '# CCBot (ab)used to train LLMs';
|
||||
$out[] = '# Ref: https://darkvisitors.com/agents/ccbot';
|
||||
$out[] = 'User-agent: CCBot';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = '# Facebook LLM Bot';
|
||||
$out[] = '# Ref: https://developers.facebook.com/docs/sharing/bot/';
|
||||
$out[] = 'User-agent: FacebookBot';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = '# DiffBot, though this one is known to have option to ignore robotstxt';
|
||||
$out[] = '# Ref https://issuetracker.revi.xyz/u/robotstxtdiffbot';
|
||||
$out[] = 'User-agent: Diffbot';
|
||||
$out[] = 'Disallow: /';
|
||||
$out[] = '# Bytespider';
|
||||
$out[] = '# Ref: https://darkvisitors.com/agents/bytespider';
|
||||
$out[] = 'User-agent: Bytespider';
|
||||
$out[] = 'Disallow: /';
|
||||
// Crawl-delay entries at the bottom
|
||||
// Ref: https://github.com/otwcode/otwarchive/pull/4411#discussion_r1044351129
|
||||
$out[] = 'User-agent: *';
|
||||
$out[] = 'Crawl-delay: 1';
|
||||
|
||||
$content = implode("\n", $out)."\n";
|
||||
$content = implode("\n", $out)."\n";
|
||||
|
||||
return id(new AphrontPlainTextResponse())
|
||||
->setContent($content)
|
||||
->setCacheDurationInSeconds(phutil_units('2 hours in seconds'))
|
||||
->setCanCDN(true)
|
||||
->setLastModified(1715266513);
|
||||
return id(new AphrontPlainTextResponse())
|
||||
->setContent($content)
|
||||
->setCacheDurationInSeconds(phutil_units('2 hours in seconds'))
|
||||
->setCanCDN(true)
|
||||
->setLastModified(1715372847);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue