# ArchGenie — public crawler policy
# Last reviewed: 2026-04-29 (X-049)
#
# Default: allow general crawling, disallow auth-walled and API surfaces.
# Path-specific Disallow rules are listed before the catch-all Allow.
# Longest-match parsers (RFC 9309) ignore rule order, but first-match parsers
# (e.g. Python's urllib.robotparser) would stop at "Allow: /" and never reach
# the Disallow lines if it came first.
# "/workspace" has no trailing slash on purpose: it is a prefix match, so it
# covers /workspace itself and every path that begins with it.
User-agent: *
Disallow: /workspace
Disallow: /api/
Allow: /

# === BLOCKED: training-data crawlers ===
# These crawlers exist to scrape content for LLM training corpora.
# We don't want our marketing content, FAQ, capability descriptions,
# or generated examples in commercial training datasets.
# One group per crawler token; each is denied the entire site.
User-agent: GPTBot
Disallow: /

User-agent: CCBot
Disallow: /

User-agent: Google-Extended
Disallow: /

# anthropic-ai is kept for older crawler builds. ClaudeBot is believed to be
# the token Anthropic's training crawler currently sends; without its own
# group it would fall under the User-agent: * default and be allowed.
# NOTE(review): confirm the current token list against Anthropic's crawler docs.
User-agent: anthropic-ai
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: Bytespider
Disallow: /

# === ALLOWED: AI-assistant runtime user-agents ===
# These crawlers exist to fetch a page on behalf of a real user
# asking the assistant a question RIGHT NOW. Allowing them means
# when developers ask "what's a good AI cloud architecture tool",
# the assistant can fetch our marketing pages and cite us.
# An explicit group is not strictly needed, because the global User-agent: *
# default already allows them; we list them here for documentation/intent.
# Each group repeats the default policy: everything is crawlable except the
# auth-walled /workspace prefix and the /api/ surface. Disallow lines precede
# the catch-all Allow so first-match parsers reach the same result as
# longest-match (RFC 9309) parsers.
User-agent: ChatGPT-User
Disallow: /workspace
Disallow: /api/
Allow: /

# Claude-Web is kept for older clients. Claude-User is believed to be the
# token Anthropic's user-initiated fetcher currently sends.
# NOTE(review): confirm against Anthropic's crawler docs.
User-agent: Claude-Web
Disallow: /workspace
Disallow: /api/
Allow: /

User-agent: Claude-User
Disallow: /workspace
Disallow: /api/
Allow: /

User-agent: PerplexityBot
Disallow: /workspace
Disallow: /api/
Allow: /

User-agent: Google-CloudVertexBot
Disallow: /workspace
Disallow: /api/
Allow: /

# === Open Graph / Link Previews ===
# Allow social platforms to fetch OG metadata for link previews.
# Same policy as the default group: public pages are fetchable, while the
# /workspace prefix and /api/ are not. Disallow lines precede the catch-all
# Allow so first-match parsers reach the same result as longest-match
# (RFC 9309) parsers.
User-agent: facebookexternalhit
Disallow: /workspace
Disallow: /api/
Allow: /

User-agent: Twitterbot
Disallow: /workspace
Disallow: /api/
Allow: /

User-agent: LinkedInBot
Disallow: /workspace
Disallow: /api/
Allow: /

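# === No exceptions ===
# (FacebookBot was previously blocked and is no longer listed. NOTE(review):
# FacebookBot and facebookexternalhit are separate user-agent tokens, so the
# facebookexternalhit allow above does not apply to FacebookBot; with no group
# of its own it now falls under the User-agent: * default. Confirm that this
# is intended. Bytespider stays blocked as a training crawler.)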

# Sitemap location. This directive stands outside the User-agent groups above.
Sitemap: https://archgenie.io/sitemap.xml