about summary refs log tree commit diff
diff options
context:
space:
mode:
author Cullum Smith <cullum@sacredheartsc.com> 2025-12-05 12:55:46 -0500
committer Cullum Smith <cullum@sacredheartsc.com> 2025-12-05 12:55:46 -0500
commit ac216a9476eaf0f64105e992a7d8d461ccda9d69 (patch)
tree 5767b0e382e2db6d7188f1200adce14a8efafd0b
parent 5ebf903ec5b9a8d5a569c5e6ceb6fb24adabf8ed (diff)
download infrastructure-ac216a9476eaf0f64105e992a7d8d461ccda9d69.tar.gz
block AI slop
-rw-r--r-- files/usr/local/etc/nginx/noai.conf.common 12
-rw-r--r-- files/usr/local/etc/nginx/vhosts.conf.git_server 6
-rw-r--r-- files/usr/local/etc/pkg/repos/FreeBSD.conf.common 2
-rw-r--r-- files/usr/local/www/cgit/custom-robots.txt.git_server 2
-rw-r--r-- scripts/hostclass/git_server 7
-rw-r--r-- scripts/hostclass/public_webserver 4
m--------- site 0
-rw-r--r-- vars/common/noai 136
-rw-r--r-- vars/common/vars (renamed from vars/common) 0
9 files changed, 166 insertions, 3 deletions
diff --git a/files/usr/local/etc/nginx/noai.conf.common b/files/usr/local/etc/nginx/noai.conf.common
new file mode 100644
index 0000000..31db76d
--- /dev/null
+++ b/files/usr/local/etc/nginx/noai.conf.common
@@ -0,0 +1,12 @@
+set \$block 0;
+if (\$http_user_agent ~* "${nginx_ai_regex}") {
+ set \$block 1;
+}
+
+if (\$request_uri = "/robots.txt") {
+ set \$block 0;
+}
+
+if (\$block) {
+ return 403;
+}
diff --git a/files/usr/local/etc/nginx/vhosts.conf.git_server b/files/usr/local/etc/nginx/vhosts.conf.git_server
index 0d24050..0f6baf6 100644
--- a/files/usr/local/etc/nginx/vhosts.conf.git_server
+++ b/files/usr/local/etc/nginx/vhosts.conf.git_server
@@ -24,6 +24,8 @@ fi)
root ${cgit_webroot};
try_files \$uri @cgit;
+ include noai.conf;
+
location ~ '^.+/(HEAD|info/refs|objects/(info/[^/]+|[0-9a-f]{2}/[0-9a-f]{38}|pack/pack-[0-9a-f]{40}\.(pack|idx))|git-(upload|receive)-pack)$' {
auth_gss on;
satisfy any;
@@ -39,6 +41,10 @@ $(printf ' deny %s;\n' $kerberized_cidrs)
fastcgi_pass unix:${gitolite_fcgiwrap_socket};
}
+ location /robots.txt {
+ alias ${cgit_webroot}/custom-robots.txt;
+ }
+
location /custom-style.css {
add_header Cache-Control "public";
expires 1d;
diff --git a/files/usr/local/etc/pkg/repos/FreeBSD.conf.common b/files/usr/local/etc/pkg/repos/FreeBSD.conf.common
index 22521b5..fa41366 100644
--- a/files/usr/local/etc/pkg/repos/FreeBSD.conf.common
+++ b/files/usr/local/etc/pkg/repos/FreeBSD.conf.common
@@ -1 +1,3 @@
FreeBSD: { enabled: no }
+
+FreeBSD-kmods: { enabled: no }
diff --git a/files/usr/local/www/cgit/custom-robots.txt.git_server b/files/usr/local/www/cgit/custom-robots.txt.git_server
index 1b33266..a0cacd2 100644
--- a/files/usr/local/www/cgit/custom-robots.txt.git_server
+++ b/files/usr/local/www/cgit/custom-robots.txt.git_server
@@ -1,3 +1,5 @@
+$(echo "$bad_user_agents" | while read -r ua; do [ -n "$ua" ] && printf 'User-agent: %s\nDisallow: /\n\n' "$ua"; done)
+
User-agent: *
Disallow: /*/snapshot/*
Disallow: /*/blame/*
diff --git a/scripts/hostclass/git_server b/scripts/hostclass/git_server
index ee576e5..65d8264 100644
--- a/scripts/hostclass/git_server
+++ b/scripts/hostclass/git_server
@@ -94,13 +94,16 @@ install_file -m 0644 \
"${cgit_webroot}/custom-style.css" \
"${cgit_webroot}/custom-favicon.ico" \
"${cgit_webroot}/custom-logo.png" \
- "${cgit_webroot}/custom-robots.txt" \
"${cgit_webroot}/custom-head-include.html" \
"${cgit_webroot}/custom-header.html"
+install_template -m 0644 \
+ "${cgit_webroot}/custom-robots.txt"
# Generate nginx configuration.
install_file -m 0644 /usr/local/etc/nginx/fastcgi_params
-install_template -m 0644 /usr/local/etc/nginx/nginx.conf
+install_template -m 0644 \
+ /usr/local/etc/nginx/nginx.conf \
+ /usr/local/etc/nginx/noai.conf
[ -f "${nginx_conf_dir}/vhosts.conf" ] || install -Cv -m 0644 /dev/null "${nginx_conf_dir}/vhosts.conf"
sysrc -v nginx_enable=YES
service nginx restart
diff --git a/scripts/hostclass/public_webserver b/scripts/hostclass/public_webserver
index bce14e5..061e99c 100644
--- a/scripts/hostclass/public_webserver
+++ b/scripts/hostclass/public_webserver
@@ -20,7 +20,9 @@ zfs set \
"${state_dataset}/vhosts"
# Configure nginx.
-install_template -m 0644 "${nginx_conf_dir}/nginx.conf"
+install_template -m 0644 \
+ "${nginx_conf_dir}/nginx.conf" \
+ "${nginx_conf_dir}/noai.conf"
[ -f "${nginx_conf_dir}/vhosts.conf" ] || install -Cv -m 0644 /dev/null "${nginx_conf_dir}/vhosts.conf"
sysrc -v nginx_enable=YES
service nginx restart
diff --git a/site b/site
-Subproject bf744e66c2e6d45aa63fd81113939872551df89
+Subproject 924dbe60bd038cdd26f06a7360450e549e483fb
diff --git a/vars/common/noai b/vars/common/noai
new file mode 100644
index 0000000..6ef03d8
--- /dev/null
+++ b/vars/common/noai
@@ -0,0 +1,136 @@
+#!/bin/sh
+
+# Variables related to blocking AI slop go in here. Sourced from:
+# https://github.com/ai-robots-txt/ai.robots.txt
+
+bad_user_agents='
+AddSearchBot
+AI2Bot
+AI2Bot-DeepResearchEval
+Ai2Bot-Dolma
+aiHitBot
+amazon-kendra
+Amazonbot
+AmazonBuyForMe
+Andibot
+Anomura
+anthropic-ai
+Applebot
+Applebot-Extended
+atlassian-bot
+Awario
+bedrockbot
+bigsur.ai
+Bravebot
+Brightbot 1.0
+BuddyBot
+Bytespider
+CCBot
+ChatGLM-Spider
+ChatGPT Agent
+ChatGPT-User
+Claude-SearchBot
+Claude-User
+Claude-Web
+ClaudeBot
+Cloudflare-AutoRAG
+CloudVertexBot
+cohere-ai
+cohere-training-data-crawler
+Cotoyogi
+Crawl4AI
+Crawlspace
+Datenbank Crawler
+DeepSeekBot
+Devin
+Diffbot
+DuckAssistBot
+Echobot Bot
+EchoboxBot
+FacebookBot
+facebookexternalhit
+Factset_spyderbot
+FirecrawlAgent
+FriendlyCrawler
+Gemini-Deep-Research
+Google-CloudVertexBot
+Google-Extended
+Google-Firebase
+Google-NotebookLM
+GoogleAgent-Mariner
+GoogleOther
+GoogleOther-Image
+GoogleOther-Video
+GPTBot
+iAskBot
+iaskspider
+iaskspider/2.0
+IbouBot
+ICC-Crawler
+ImagesiftBot
+imageSpider
+img2dataset
+ISSCyberRiskCrawler
+Kangaroo Bot
+KlaviyoAIBot
+KunatoCrawler
+laion-huggingface-processor
+LAIONDownloader
+LCC
+LinerBot
+Linguee Bot
+LinkupBot
+Manus-User
+meta-externalagent
+Meta-ExternalAgent
+meta-externalfetcher
+Meta-ExternalFetcher
+meta-webindexer
+MistralAI-User
+MistralAI-User/1.0
+MyCentralAIScraperBot
+netEstate Imprint Crawler
+NotebookLM
+NovaAct
+OAI-SearchBot
+omgili
+omgilibot
+OpenAI
+Operator
+PanguBot
+Panscient
+panscient.com
+Perplexity-User
+PerplexityBot
+PetalBot
+PhindBot
+Poggio-Citations
+Poseidon Research Crawler
+QualifiedBot
+QuillBot
+quillbot.com
+SBIntuitionsBot
+Scrapy
+SemrushBot-OCOB
+SemrushBot-SWA
+ShapBot
+Sidetrade indexer bot
+Spider
+TerraCotta
+Thinkbot
+TikTokSpider
+Timpibot
+VelenPublicWebCrawler
+WARDBot
+Webzio-Extended
+webzio-extended
+wpbot
+WRTNBot
+YaK
+YandexAdditional
+YandexAdditionalBot
+YouBot
+ZanistaBot
+'
+
+nginx_ai_regex='(AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Andibot|Anomura|anthropic\-ai|Applebot|Applebot\-Extended|atlassian\-bot|Awario|bedrockbot|bigsur\.ai|Bravebot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|netEstate\ Imprint\ Crawler|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TerraCotta|Thinkbot|TikTokSpider|Timpibot|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot)'
diff --git a/vars/common b/vars/common/vars
index be8e34a..be8e34a 100644
--- a/vars/common
+++ b/vars/common/vars