diff options
| author | Cullum Smith <cullum@sacredheartsc.com> | 2025-12-05 12:55:46 -0500 |
|---|---|---|
| committer | Cullum Smith <cullum@sacredheartsc.com> | 2025-12-05 12:55:46 -0500 |
| commit | ac216a9476eaf0f64105e992a7d8d461ccda9d69 (patch) | |
| tree | 5767b0e382e2db6d7188f1200adce14a8efafd0b | |
| parent | 5ebf903ec5b9a8d5a569c5e6ceb6fb24adabf8ed (diff) | |
| download | infrastructure-ac216a9476eaf0f64105e992a7d8d461ccda9d69.tar.gz | |

block AI slop

| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | files/usr/local/etc/nginx/noai.conf.common | 12 | ||||
| -rw-r--r-- | files/usr/local/etc/nginx/vhosts.conf.git_server | 6 | ||||
| -rw-r--r-- | files/usr/local/etc/pkg/repos/FreeBSD.conf.common | 2 | ||||
| -rw-r--r-- | files/usr/local/www/cgit/custom-robots.txt.git_server | 2 | ||||
| -rw-r--r-- | scripts/hostclass/git_server | 7 | ||||
| -rw-r--r-- | scripts/hostclass/public_webserver | 4 | ||||
| m--------- | site | 0 | ||||
| -rw-r--r-- | vars/common/noai | 136 | ||||
| -rw-r--r-- | vars/common/vars (renamed from vars/common) | 0 |
9 files changed, 166 insertions, 3 deletions
diff --git a/files/usr/local/etc/nginx/noai.conf.common b/files/usr/local/etc/nginx/noai.conf.common new file mode 100644 index 0000000..31db76d --- /dev/null +++ b/files/usr/local/etc/nginx/noai.conf.common @@ -0,0 +1,12 @@ +set \$block 0; +if (\$http_user_agent ~* "${nginx_ai_regex}") { + set \$block 1; +} + +if (\$request_uri = "/robots.txt") { + set \$block 0; +} + +if (\$block) { + return 403; +} diff --git a/files/usr/local/etc/nginx/vhosts.conf.git_server b/files/usr/local/etc/nginx/vhosts.conf.git_server index 0d24050..0f6baf6 100644 --- a/files/usr/local/etc/nginx/vhosts.conf.git_server +++ b/files/usr/local/etc/nginx/vhosts.conf.git_server @@ -24,6 +24,8 @@ fi) root ${cgit_webroot}; try_files \$uri @cgit; + include noai.conf; + location ~ '^.+/(HEAD|info/refs|objects/(info/[^/]+|[0-9a-f]{2}/[0-9a-f]{38}|pack/pack-[0-9a-f]{40}\.(pack|idx))|git-(upload|receive)-pack)$' { auth_gss on; satisfy any; @@ -39,6 +41,10 @@ $(printf ' deny %s;\n' $kerberized_cidrs) fastcgi_pass unix:${gitolite_fcgiwrap_socket}; } + location /robots.txt { + alias ${cgit_webroot}/custom-robots.txt; + } + location /custom-style.css { add_header Cache-Control "public"; expires 1d; diff --git a/files/usr/local/etc/pkg/repos/FreeBSD.conf.common b/files/usr/local/etc/pkg/repos/FreeBSD.conf.common index 22521b5..fa41366 100644 --- a/files/usr/local/etc/pkg/repos/FreeBSD.conf.common +++ b/files/usr/local/etc/pkg/repos/FreeBSD.conf.common @@ -1 +1,3 @@ FreeBSD: { enabled: no } + +FreeBSD-kmods: { enabled: no } diff --git a/files/usr/local/www/cgit/custom-robots.txt.git_server b/files/usr/local/www/cgit/custom-robots.txt.git_server index 1b33266..a0cacd2 100644 --- a/files/usr/local/www/cgit/custom-robots.txt.git_server +++ b/files/usr/local/www/cgit/custom-robots.txt.git_server @@ -1,3 +1,5 @@ +$(echo "$bad_user_agents" | while read -r ua; do [ -n "$ua" ] && printf 'User-agent: %s\nDisallow: /\n\n' "$ua"; done) + User-agent: * Disallow: /*/snapshot/* Disallow: /*/blame/* diff --git 
a/scripts/hostclass/git_server b/scripts/hostclass/git_server index ee576e5..65d8264 100644 --- a/scripts/hostclass/git_server +++ b/scripts/hostclass/git_server @@ -94,13 +94,16 @@ install_file -m 0644 \ "${cgit_webroot}/custom-style.css" \ "${cgit_webroot}/custom-favicon.ico" \ "${cgit_webroot}/custom-logo.png" \ - "${cgit_webroot}/custom-robots.txt" \ "${cgit_webroot}/custom-head-include.html" \ "${cgit_webroot}/custom-header.html" +install_template -m 0644 \ + "${cgit_webroot}/custom-robots.txt" # Generate nginx configuration. install_file -m 0644 /usr/local/etc/nginx/fastcgi_params -install_template -m 0644 /usr/local/etc/nginx/nginx.conf +install_template -m 0644 \ + /usr/local/etc/nginx/nginx.conf \ + /usr/local/etc/nginx/noai.conf [ -f "${nginx_conf_dir}/vhosts.conf" ] || install -Cv -m 0644 /dev/null "${nginx_conf_dir}/vhosts.conf" sysrc -v nginx_enable=YES service nginx restart diff --git a/scripts/hostclass/public_webserver b/scripts/hostclass/public_webserver index bce14e5..061e99c 100644 --- a/scripts/hostclass/public_webserver +++ b/scripts/hostclass/public_webserver @@ -20,7 +20,9 @@ zfs set \ "${state_dataset}/vhosts" # Configure nginx. -install_template -m 0644 "${nginx_conf_dir}/nginx.conf" +install_template -m 0644 \ + "${nginx_conf_dir}/nginx.conf" \ + "${nginx_conf_dir}/noai.conf" [ -f "${nginx_conf_dir}/vhosts.conf" ] || install -Cv -m 0644 /dev/null "${nginx_conf_dir}/vhosts.conf" sysrc -v nginx_enable=YES service nginx restart diff --git a/site b/site -Subproject bf744e66c2e6d45aa63fd81113939872551df89 +Subproject 924dbe60bd038cdd26f06a7360450e549e483fb diff --git a/vars/common/noai b/vars/common/noai new file mode 100644 index 0000000..6ef03d8 --- /dev/null +++ b/vars/common/noai @@ -0,0 +1,136 @@ +#!/bin/sh + +# Variables related to blocking AI slop go in here. 
Sourced from: +# https://github.com/ai-robots-txt/ai.robots.txt + +bad_user_agents=' +AddSearchBot +AI2Bot +AI2Bot-DeepResearchEval +Ai2Bot-Dolma +aiHitBot +amazon-kendra +Amazonbot +AmazonBuyForMe +Andibot +Anomura +anthropic-ai +Applebot +Applebot-Extended +atlassian-bot +Awario +bedrockbot +bigsur.ai +Bravebot +Brightbot 1.0 +BuddyBot +Bytespider +CCBot +ChatGLM-Spider +ChatGPT Agent +ChatGPT-User +Claude-SearchBot +Claude-User +Claude-Web +ClaudeBot +Cloudflare-AutoRAG +CloudVertexBot +cohere-ai +cohere-training-data-crawler +Cotoyogi +Crawl4AI +Crawlspace +Datenbank Crawler +DeepSeekBot +Devin +Diffbot +DuckAssistBot +Echobot Bot +EchoboxBot +FacebookBot +facebookexternalhit +Factset_spyderbot +FirecrawlAgent +FriendlyCrawler +Gemini-Deep-Research +Google-CloudVertexBot +Google-Extended +Google-Firebase +Google-NotebookLM +GoogleAgent-Mariner +GoogleOther +GoogleOther-Image +GoogleOther-Video +GPTBot +iAskBot +iaskspider +iaskspider/2.0 +IbouBot +ICC-Crawler +ImagesiftBot +imageSpider +img2dataset +ISSCyberRiskCrawler +Kangaroo Bot +KlaviyoAIBot +KunatoCrawler +laion-huggingface-processor +LAIONDownloader +LCC +LinerBot +Linguee Bot +LinkupBot +Manus-User +meta-externalagent +Meta-ExternalAgent +meta-externalfetcher +Meta-ExternalFetcher +meta-webindexer +MistralAI-User +MistralAI-User/1.0 +MyCentralAIScraperBot +netEstate Imprint Crawler +NotebookLM +NovaAct +OAI-SearchBot +omgili +omgilibot +OpenAI +Operator +PanguBot +Panscient +panscient.com +Perplexity-User +PerplexityBot +PetalBot +PhindBot +Poggio-Citations +Poseidon Research Crawler +QualifiedBot +QuillBot +quillbot.com +SBIntuitionsBot +Scrapy +SemrushBot-OCOB +SemrushBot-SWA +ShapBot +Sidetrade indexer bot +Spider +TerraCotta +Thinkbot +TikTokSpider +Timpibot +VelenPublicWebCrawler +WARDBot +Webzio-Extended +webzio-extended +wpbot +WRTNBot +YaK +YandexAdditional +YandexAdditionalBot +YouBot +ZanistaBot +' + 
+nginx_ai_regex='(AddSearchBot|AI2Bot|AI2Bot\-DeepResearchEval|Ai2Bot\-Dolma|aiHitBot|amazon\-kendra|Amazonbot|AmazonBuyForMe|Andibot|Anomura|anthropic\-ai|Applebot|Applebot\-Extended|atlassian\-bot|Awario|bedrockbot|bigsur\.ai|Bravebot|Brightbot\ 1\.0|BuddyBot|Bytespider|CCBot|ChatGLM\-Spider|ChatGPT\ Agent|ChatGPT\-User|Claude\-SearchBot|Claude\-User|Claude\-Web|ClaudeBot|Cloudflare\-AutoRAG|CloudVertexBot|cohere\-ai|cohere\-training\-data\-crawler|Cotoyogi|Crawl4AI|Crawlspace|Datenbank\ Crawler|DeepSeekBot|Devin|Diffbot|DuckAssistBot|Echobot\ Bot|EchoboxBot|FacebookBot|facebookexternalhit|Factset_spyderbot|FirecrawlAgent|FriendlyCrawler|Gemini\-Deep\-Research|Google\-CloudVertexBot|Google\-Extended|Google\-Firebase|Google\-NotebookLM|GoogleAgent\-Mariner|GoogleOther|GoogleOther\-Image|GoogleOther\-Video|GPTBot|iAskBot|iaskspider|iaskspider/2\.0|IbouBot|ICC\-Crawler|ImagesiftBot|imageSpider|img2dataset|ISSCyberRiskCrawler|Kangaroo\ Bot|KlaviyoAIBot|KunatoCrawler|laion\-huggingface\-processor|LAIONDownloader|LCC|LinerBot|Linguee\ Bot|LinkupBot|Manus\-User|meta\-externalagent|Meta\-ExternalAgent|meta\-externalfetcher|Meta\-ExternalFetcher|meta\-webindexer|MistralAI\-User|MistralAI\-User/1\.0|MyCentralAIScraperBot|netEstate\ Imprint\ Crawler|NotebookLM|NovaAct|OAI\-SearchBot|omgili|omgilibot|OpenAI|Operator|PanguBot|Panscient|panscient\.com|Perplexity\-User|PerplexityBot|PetalBot|PhindBot|Poggio\-Citations|Poseidon\ Research\ Crawler|QualifiedBot|QuillBot|quillbot\.com|SBIntuitionsBot|Scrapy|SemrushBot\-OCOB|SemrushBot\-SWA|ShapBot|Sidetrade\ indexer\ bot|Spider|TerraCotta|Thinkbot|TikTokSpider|Timpibot|VelenPublicWebCrawler|WARDBot|Webzio\-Extended|webzio\-extended|wpbot|WRTNBot|YaK|YandexAdditional|YandexAdditionalBot|YouBot|ZanistaBot)' diff --git a/vars/common b/vars/common/vars index be8e34a..be8e34a 100644 --- a/vars/common +++ b/vars/common/vars |
