<?php
// Copyright (C) 2024 Hong Yongmin <https://revi.xyz/>
// SPDX-License-Identifier: Apache-2.0

/**
 * Builds a robots.txt-style list of crawler directives and returns it as a
 * plain-text, CDN-cacheable response that does not require login.
 *
 * The version stamp printed in the body and the Last-Modified value sent with
 * the response are both derived from ROBOTS_VERSION, so only that constant
 * needs to be bumped when the rules change.
 */
abstract class PhabricatorRobotsController extends PhabricatorController {

	/**
	 * Version stamp of the rule set, formatted as `YYYYMMDDTHHMMSS+ZZZZ`
	 * (PHP date format `Ymd\THisO`).
	 *
	 * By the author's convention this is the time editing of the rules
	 * started. It is emitted verbatim in the `# version:` comment and parsed
	 * into the Unix timestamp handed to setLastModified().
	 */
	const ROBOTS_VERSION = '20240616T191900+0900';

	/**
	 * Reports that this controller does not require a logged-in user.
	 *
	 * @return bool Always false.
	 */
	public function shouldRequireLogin() {
		return false;
	}

	// TODO: Different content for cdn domains

	/**
	 * Assembles the rule lines into a newline-terminated body and wraps it in
	 * a plain-text response with a two-hour cache duration.
	 *
	 * @return AphrontPlainTextResponse Response carrying the rules.
	 */
	public function processRequest() {
		$content = implode("\n", $this->buildRobotsLines())."\n";

		return id(new AphrontPlainTextResponse())
			->setContent($content)
			->setCacheDurationInSeconds(phutil_units('2 hours in seconds'))
			->setClientIDCookie(false)
			->setCanCDN(true)
			->setLastModified($this->getRobotsVersionEpoch());
	}

	/**
	 * Converts ROBOTS_VERSION into a Unix timestamp.
	 *
	 * @return int Seconds since the Unix epoch for ROBOTS_VERSION.
	 * @throws Exception When ROBOTS_VERSION does not match `Ymd\THisO`.
	 */
	private function getRobotsVersionEpoch() {
		$parsed = DateTime::createFromFormat('Ymd\THisO', self::ROBOTS_VERSION);

		// createFromFormat() returns false instead of throwing on bad input;
		// fail loudly so a malformed version stamp is noticed immediately.
		if ($parsed === false) {
			throw new Exception(
				sprintf(
					'ROBOTS_VERSION "%s" is not in "YYYYMMDDTHHMMSS+ZZZZ" format.',
					self::ROBOTS_VERSION));
		}

		return $parsed->getTimestamp();
	}

	/**
	 * Builds the individual lines of the response body, in output order.
	 *
	 * @return array List of strings, one per output line (no line endings).
	 */
	private function buildRobotsLines() {
		$out = array();

		$out[] = '# Adapted from phabricator.wikimedia.org, we.phorge.it';
		// The version stamp comes from ROBOTS_VERSION; bump the constant when
		// the rules below change.
		$out[] = '# version: '.self::ROBOTS_VERSION;
		$out[] = '# also at https://github.com/revi/sandbox.git';
		$out[] = 'User-Agent: *';
		$out[] = 'Disallow: /diffusion/';
		$out[] = 'Disallow: /source/';
		$out[] = 'Disallow: /multimeter/';
		$out[] = 'Disallow: /policy/explain';
		$out[] = 'Disallow: /auth';
		$out[] = 'Disallow: /login';
		$out[] = 'Disallow: /maniphest/transaction';
		$out[] = 'Disallow: /tag';
		$out[] = 'Disallow: /search/';
		$out[] = 'Disallow: /conduit';
		$out[] = 'Disallow: /api';
		$out[] = 'Disallow: /project';
		$out[] = 'Disallow: /applications';
		$out[] = 'Disallow: /token';
		$out[] = 'Disallow: /pholio';
		$out[] = 'Disallow: /dashboard';
		$out[] = 'Disallow: /calendar';
		$out[] = 'Disallow: /herald';
		// NOTE(review): robots.txt rules are prefix matches, so the
		// single-letter patterns below (/r*, /D*, /F*, /P*, /J*) cover every
		// path starting with that letter, not only object monograms. Confirm
		// that breadth is intended.
		// This is commits.
		$out[] = 'Disallow: /r*';
		// This is differential revisions. (D*)
		$out[] = 'Disallow: /differential';
		$out[] = 'Disallow: /D*';
		// This is Files. (F*)
		$out[] = 'Disallow: /file';
		$out[] = 'Disallow: /F*';
		// This is pastes (P*)
		$out[] = 'Disallow: /paste';
		$out[] = 'Disallow: /P*';
		// This is blog entries (J*)
		$out[] = 'Disallow: /phame';
		$out[] = 'Disallow: /J*';
		// This is user list.
		// As of 2024-04-17 the user list is behind auth, but that might change.
		$out[] = 'Disallow: /people';
		// This is user profile link.
		$out[] = 'Disallow: /p/';
		// Phorge specific entries end here.
		$out[] = '# This is cloudflare endpoint';
		$out[] = '# Ref: https://issuetracker.revi.xyz/u/cloudflarecdncgi';
		$out[] = 'Disallow: /cdn-cgi/';
		$out[] = '# Google Ads are not welcome';
		$out[] = 'User-agent: Mediapartners-Google';
		$out[] = 'Disallow: /';
		$out[] = 'User-agent: AdsBot-Google';
		$out[] = 'Disallow: /';
		$out[] = 'User-agent: AdsBot-Google-Mobile';
		$out[] = 'Disallow: /';
		// While I sometimes (borderline 'rare') use LLMs (GPT, Gemini, …), I'd
		// rather LLMs not use my stuff to profit. My stuff is probably of
		// little interest to them anyway.
		$out[] = '# ChatGPT Crawlers are not welcome';
		$out[] = '# Ref: https://platform.openai.com/docs/plugins/bot';
		$out[] = 'User-agent: ChatGPT-User';
		$out[] = 'Disallow: /';
		$out[] = 'User-agent: GPTBot';
		$out[] = 'Disallow: /';
		$out[] = '# Google Gemini AI Crawlers are also not welcome';
		$out[] = '# Ref: https://issuetracker.revi.xyz/u/googleextended';
		$out[] = 'User-agent: Google-Extended';
		$out[] = 'Disallow: /';
		$out[] = '# CCBot (ab)used to train LLMs';
		$out[] = '# Ref: https://darkvisitors.com/agents/ccbot';
		$out[] = 'User-agent: CCBot';
		$out[] = 'Disallow: /';
		$out[] = '# Facebook LLM Bot';
		$out[] = '# Ref: https://developers.facebook.com/docs/sharing/bot/';
		$out[] = 'User-agent: FacebookBot';
		$out[] = 'Disallow: /';
		$out[] = '# DiffBot, though this one is known to have option to ignore robotstxt';
		$out[] = '# Ref https://issuetracker.revi.xyz/u/robotstxtdiffbot';
		$out[] = 'User-agent: Diffbot';
		$out[] = 'Disallow: /';
		$out[] = '# Bytespider';
		$out[] = '# Ref: https://darkvisitors.com/agents/bytespider';
		$out[] = 'User-agent: Bytespider';
		$out[] = 'Disallow: /';
		$out[] = '# Block PetalBot, misbehaving';
		$out[] = 'User-agent: PetalBot';
		$out[] = 'Disallow: /';
		$out[] = '# Block peer39';
		$out[] = 'User-agent: peer39_crawler';
		$out[] = 'User-agent: peer39_crawler/1.0';
		$out[] = 'Disallow: /';
		$out[] = '# Block SemRushBot';
		$out[] = 'User-agent: SemrushBot';
		$out[] = 'Disallow: /';
		$out[] = '# Block AhrefsBot';
		$out[] = 'User-agent: AhrefsBot';
		$out[] = 'Disallow: /';
		$out[] = '# See https://revi.xyz/robots.txt for rationales';
		$out[] = 'User-agent: TurnitinBot';
		$out[] = 'Disallow: /';
		$out[] = 'User-agent: NPBot';
		$out[] = 'Disallow: /';
		$out[] = 'User-agent: SlySearch';
		$out[] = 'Disallow: /';
		$out[] = 'User-agent: BLEXBot';
		$out[] = 'Disallow: /';
		$out[] = 'User-agent: BrandVerity/1.0';
		$out[] = 'Disallow: /';
		// Crawl-delay entries at the bottom
		// Ref: https://github.com/otwcode/otwarchive/pull/4411#discussion_r1044351129
		$out[] = 'User-agent: *';
		$out[] = 'Crawl-delay: 1';

		return $out;
	}
}