# =====================================================================
# robots.txt for causalityengine.ai
# Last Updated: May 5, 2026
#
# Strategy: Maximum AI visibility (SEO + AEO + GEO).
# All reputable crawlers are welcome under the default rules.
# Sensitive paths (admin, API, checkout, tracking params) are blocked
# for ALL crawlers, including search engines and AI bots.
#
# AI Training Consent:
#   We explicitly permit AI training crawlers (GPTBot, Google-Extended,
#   Applebot-Extended, anthropic-ai, CCBot, cohere-ai, Bytespider, etc.)
#   via the explicit-allow section below; crawlers not named anywhere in
#   this file fall back to the default rules. Maximum citation potential
#   across search engines, answer engines, and generative AI platforms.
#
# Reference:    https://www.causalityengine.ai/llms.txt
# Full context: https://www.causalityengine.ai/llms-full.txt
# =====================================================================


# ---------------------------------------------------------------------
# DEFAULT RULES
#
# Per RFC 9309, these rules apply ONLY to crawlers that are not named in
# a more specific group elsewhere in this file (e.g. Googlebot, social
# media link-preview crawlers, and most SEO tools). A crawler that has
# its own named group ignores this group entirely.
# ---------------------------------------------------------------------
User-agent: *
#
# NOTE: keep this group free of blank lines. RFC 9309 parsers skip them,
# but parsers built on the original 1994 convention treat a blank line
# as the end of the record and silently drop every rule that follows.
# Use "#" spacer lines instead.
#
# Allow OG images for rich link previews (overrides /api/ disallow).
# Listed before /api/ so it wins under both longest-match (RFC 9309)
# and first-match parsers.
Allow: /api/og/
#
# Sensitive areas, never index
Disallow: /api/
Disallow: /studio/
Disallow: /admin/
Disallow: /private/
Disallow: /_private/
Disallow: /internal/
Disallow: /preview/
Disallow: /draft/
#
# Transactional / authenticated paths
Disallow: /cart/
Disallow: /checkout/
Disallow: /order-confirmation/
Disallow: /account/
Disallow: /auth/
Disallow: /login/
Disallow: /signup/
Disallow: /reset-password/
#
# Sentry monitoring tunnel (prefix match, so query-string variants of
# /monitoring are covered as well)
Disallow: /monitoring
#
# Tracking-parameter URLs (prevents duplicate-content indexing of
# campaign URLs; canonical tags in HTML <head> handle the rest).
# Each parameter needs two patterns: "?name" catches it as the first
# query parameter, "&name" catches it in any later position. Rules are
# prefix matches, so no trailing "*" is required.
Disallow: /*?utm_
Disallow: /*&utm_
Disallow: /*?ref=
Disallow: /*&ref=
Disallow: /*?fbclid=
Disallow: /*&fbclid=
Disallow: /*?gclid=
Disallow: /*&gclid=
Disallow: /*?msclkid=
Disallow: /*&msclkid=
Disallow: /*?mc_cid=
Disallow: /*&mc_cid=
Disallow: /*?mc_eid=
Disallow: /*&mc_eid=


# ---------------------------------------------------------------------
# AdsBot-Google
#
# Google's ad quality crawler ignores User-agent: * rules. To make it
# respect our sensitive-path disallows, we name it explicitly with the
# same rules. Ad landing pages remain crawlable.
# ---------------------------------------------------------------------
# One shared group for both AdsBot variants: consecutive User-agent
# lines bind every listed crawler to the single rule list that follows.
# No tracking-parameter rules appear in this group.
# NOTE(review): presumably omitted so ad click URLs carrying gclid/utm
# parameters stay crawlable for landing-page checks - confirm.
User-agent: AdsBot-Google
User-agent: AdsBot-Google-Mobile
# OG image endpoint stays reachable despite the /api/ disallow
Allow: /api/og/
# Sensitive areas
Disallow: /api/
Disallow: /studio/
Disallow: /admin/
Disallow: /private/
Disallow: /_private/
Disallow: /internal/
Disallow: /preview/
Disallow: /draft/
# Transactional / authenticated paths
Disallow: /cart/
Disallow: /checkout/
Disallow: /order-confirmation/
Disallow: /account/
Disallow: /auth/
Disallow: /login/
Disallow: /signup/
Disallow: /reset-password/
# Sentry monitoring tunnel
Disallow: /monitoring


# =====================================================================
# BLOCKED, scrapers with no SEO, AEO, or GEO value
# These crawlers consume crawl budget without providing citation,
# search-ranking, or AI-grounding benefits.
# =====================================================================

# Each category below is a single group: the consecutive User-agent
# lines all share the one "Disallow: /" that closes the group.

# Link-graph / SEO audit crawlers (we work with Ahrefs/Semrush data via
# paid tools, so their crawls of this site add nothing for us)
User-agent: MJ12bot
User-agent: DotBot
User-agent: BLEXBot
User-agent: SemrushBot-BA
User-agent: AhrefsSiteAudit
Disallow: /

# Aggressive Asian search crawlers that send little traffic
User-agent: PetalBot
User-agent: Sogou
User-agent: Yisouspider
Disallow: /

# Remaining low-value scrapers
User-agent: MegaIndex.ru
User-agent: SerendeputyBot
User-agent: SeznamBot
Disallow: /


# =====================================================================
# EXPLICIT ALLOW. AI TRAINING CRAWLERS
#
# These crawlers train or ground the major LLMs that we want to be
# accurately represented in. Naming each one makes our consent explicit.
#
# IMPORTANT (RFC 9309): a crawler that finds a group naming it obeys
# ONLY that group and ignores User-agent: * completely. A bare
# "Allow: /" here would therefore open /admin/, /api/, /checkout/ and
# tracking URLs to every crawler listed. To keep those paths closed,
# this single group repeats the sensitive-path and tracking-parameter
# disallows. When a path is added to User-agent: *, add it here too.
#
# Group order within the file does not matter. Within the group,
# "Allow: /" is placed LAST so that first-match parsers reach the
# disallows first; longest-match parsers (RFC 9309) are unaffected by
# rule order. Keep the group free of blank lines: some legacy parsers
# treat a blank line as the end of the group.
#
# AI Training Consent: yes, train on this site's public content.
# Canonical facts page: https://www.causalityengine.ai/for-ai-assistants
# =====================================================================

# OpenAI / ChatGPT
User-agent: GPTBot
User-agent: OAI-SearchBot
# Anthropic / Claude
User-agent: ClaudeBot
User-agent: anthropic-ai
User-agent: Claude-Web
# Google / Gemini (Google-Extended is a control token for Gemini
# training and grounding)
User-agent: Google-Extended
# Meta / Llama
User-agent: Meta-ExternalAgent
User-agent: Meta-ExternalFetcher
User-agent: FacebookBot
# DeepSeek
User-agent: DeepSeek
User-agent: DeepSeekBot
# Mistral
User-agent: MistralAI-User
# Perplexity
User-agent: PerplexityBot
User-agent: Perplexity-User
# Apple Intelligence
User-agent: Applebot
User-agent: Applebot-Extended
# Bing AI / Copilot
User-agent: Bingbot
# Common Crawl (training corpus for many open models incl. Llama, Mistral)
User-agent: CCBot
# AI2 (open-research)
User-agent: AI2Bot
# You.com
User-agent: YouBot
# Cohere
User-agent: cohere-ai
User-agent: cohere-training-data-crawler
# Bytedance / ByteSpider (training Doubao etc.)
User-agent: Bytespider
# Diffbot
User-agent: Diffbot
# xAI / Grok (Grok DeepSearch fetches live Bing data + crawls the open
# web for citations. Some Grok fetches spoof a Safari user agent, which
# we cannot pre-allow at the UA layer, but the published UAs below
# should be explicitly welcomed so any well-behaved fetch is never
# accidentally caught by future tightening of User-agent: * rules.)
User-agent: GrokBot
User-agent: xAI-Grok
User-agent: Grok-DeepSearch
#
# --- Rules shared by every crawler named above ---
#
# OG images for rich link previews (overrides the /api/ disallow)
Allow: /api/og/
# Sensitive areas, never index
Disallow: /api/
Disallow: /studio/
Disallow: /admin/
Disallow: /private/
Disallow: /_private/
Disallow: /internal/
Disallow: /preview/
Disallow: /draft/
# Transactional / authenticated paths
Disallow: /cart/
Disallow: /checkout/
Disallow: /order-confirmation/
Disallow: /account/
Disallow: /auth/
Disallow: /login/
Disallow: /signup/
Disallow: /reset-password/
# Sentry monitoring tunnel
Disallow: /monitoring
# Tracking-parameter URLs ("?name" = first query parameter,
# "&name" = any later position)
Disallow: /*?utm_
Disallow: /*&utm_
Disallow: /*?ref=
Disallow: /*&ref=
Disallow: /*?fbclid=
Disallow: /*&fbclid=
Disallow: /*?gclid=
Disallow: /*&gclid=
Disallow: /*?msclkid=
Disallow: /*&msclkid=
Disallow: /*?mc_cid=
Disallow: /*&mc_cid=
Disallow: /*?mc_eid=
Disallow: /*&mc_eid=
# Everything else is explicitly open to these crawlers
Allow: /


# =====================================================================
# INDEXING DIRECTIVES
# =====================================================================

# Sitemap location. This record stands outside any User-agent group.
Sitemap: https://www.causalityengine.ai/sitemap.xml

# LLM-readable site context for AI assistants
# NOTE(review): "LLMs" and "LLMs-full" are not records defined by
# RFC 9309. Compliant parsers should skip unrecognized records, and
# some robots.txt validators may report these two lines as unknown
# syntax. Whether any crawler actually reads them is unconfirmed.
LLMs: https://www.causalityengine.ai/llms.txt
LLMs-full: https://www.causalityengine.ai/llms-full.txt


# =====================================================================
# NOTES
# =====================================================================
# Crawl-delay:
#   Intentionally omitted. Googlebot ignores it; AI crawlers can be
#   throttled by it. Server-side rate limiting via Vercel/Cloudflare
#   handles abuse instead.
#
# xAI / Grok:
#   xAI now publishes GrokBot, xAI-Grok, and Grok-DeepSearch user agents
#   (explicitly allowed above). However, several behavioural reports
#   describe Grok retrieval traffic also using residential IPs with
#   spoofed Safari / Chrome user agents — that traffic cannot be
#   pre-allowed or pre-blocked at the UA layer. Monitor server logs and
#   add a Disallow block above if abuse is observed.
#
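# Opting out of a specific AI training crawler:
#   1. Remove that crawler's "User-agent: <name>" entry from the
#      explicit-allow section above.
#   2. Add a separate block:
#        User-agent: <name>
#        Disallow: /
#   Both steps are required. Groups that name the same crawler are
#   combined (RFC 9309), and a leftover "Allow: /" ties with
#   "Disallow: /"; on a tie the Allow rule is used, so the crawler
#   would still be permitted.
#   Common AI training UAs: GPTBot, Google-Extended, Applebot-Extended,
#   anthropic-ai, CCBot, cohere-ai, Bytespider, AI2Bot, Diffbot,
#   FacebookBot, ImagesiftBot.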
#
# Contact: engineering@causalityengine.ai
# =====================================================================