Initial commit: SIC harness (backend, web, pi-adapter, configs, docs)

- pnpm monorepo: apps/api (Fastify + SQLite + SSE), apps/web (React+Vite), packages/shared, packages/pi-adapter
- Local auth (admin/webhook-runner roles) + Keycloak JWT ready
- Multi-session chat with reliable history (user persisted before LLM, assistant persisted after stream)
- Markdown knowledge base with /api/docs/search + /api/docs/:id
- YAML webhook catalog with backend-only execution, retry/backoff, audit (webhook_runs), and per-user rate limit
- Skills config (sre-on-call, blameless-postmortem, security-incident) injected into LLM system prompt
- LLM provider failover chain (config/models.yml fallback + LLM_FALLBACK_CHAIN override)
- Context-aware webhooks panel + backend id-mention safety net
- Per-message stats (time/duration/tokens/model), Markdown+GFM render, code & table copy/download buttons
- Vitest suite, end-to-end smoke test (scripts/smoke.mjs), per-session system prompt override
- /metrics Prometheus endpoint + /api/metrics JSON, request-id correlation
- dotenv with explicit repo-root path; envString/envNumber helpers (handles empty-string env)
- Runbooks + SOPs under knowledge/ in English; README, docs, and INDEX.md in English
2026-06-29 16:20:53 +02:00
commit 62728b2200
89 changed files with 11992 additions and 0 deletions
--- a/config/docs.yml
+++ b/config/docs.yml
@@ -0,0 +1,5 @@
+sources:  # document source list; this file declares a single entry
+  - id: knowledge
+    path: ./knowledge  # NOTE(review): relative path; confirm the base directory used by the loader
+    include:
+      - "**/*.md"  # Markdown files only, at any depth (assuming standard glob semantics)
--- a/config/mcp.yml
+++ b/config/mcp.yml
@@ -0,0 +1,75 @@
+# MCP (Model Context Protocol) tool catalog.
+#
+# Phase 1: this is a DECLARATIVE catalog of available tools. The SIC backend
+# does NOT call any MCP server directly. The LLM sees these tools in its
+# context (so it can reason about what's possible) and the right panel
+# renders them as recommendations that the user can inspect. Execution is
+# reserved for the future MCP runtime, which will run a per-request MCP
+# client; for now every tool here is "inspect only".
+#
+# Each tool:
+#   id            stable identifier
+#   name          human-readable label
+#   description   what the tool does (sent to the LLM verbatim)
+#   server        optional reference to a configured MCP server (Phase 2+)
+#   parameters    JSON Schema for the tool arguments
+#   tags          free-form tags for filtering
+#   enabled       whether the tool is exposed at all
+mcp_servers: []  # no MCP servers are configured; every tool below sets server: null
+
+mcp_tools:
+  - id: kb.search
+    name: Search knowledge base
+    description: |
+      Search the internal knowledge base (runbooks, SOPs, architecture docs) for
+      documents relevant to the query. Returns matching document ids and
+      relevance scores. Does NOT execute anything.
+    server: null
+    parameters:
+      type: object
+      required: [query]  # query is the only mandatory argument
+      properties:
+        query:
+          type: string
+          description: Natural-language search query.
+        limit:  # NOTE(review): default of 5 is stated only in the description; schema has no default keyword
+          type: integer
+          minimum: 1
+          maximum: 20
+          description: Maximum results to return. Defaults to 5.
+    tags: [search, knowledge, docs]
+    enabled: true
+  - id: sessions.list
+    name: List recent chat sessions
+    description: |
+      List the calling user's most recent chat sessions, ordered by updated_at
+      descending. Does NOT execute anything; returns metadata only.
+    server: null
+    parameters:
+      type: object
+      required: []  # every argument is optional
+      properties:
+        limit:  # NOTE(review): default of 10 is stated only in the description; schema has no default keyword
+          type: integer
+          minimum: 1
+          maximum: 50
+          description: Maximum sessions to return. Defaults to 10.
+    tags: [sessions, listing]
+    enabled: true
+  - id: webhooks.usage
+    name: Webhook usage stats
+    description: |
+      Return per-webhook usage stats for the calling user over a recent time
+      window (defaults to 7 days). Read-only.
+    server: null
+    parameters:
+      type: object
+      required: []  # every argument is optional
+      properties:
+        days:  # NOTE(review): default of 7 is stated only in the description; schema has no default keyword
+          type: integer
+          minimum: 1
+          maximum: 90
+          description: Window in days. Defaults to 7.
+    tags: [webhooks, audit, read-only]
+    enabled: true
--- a/config/models.yml
+++ b/config/models.yml
@@ -0,0 +1,29 @@
+models:  # four entries: three MiniMax variants plus mr-auto
+  - id: fast  # NOTE(review): no fallback list here or on reasoning; only balanced declares one
+    label: MiniMax Fast
+    provider: openai-compatible
+    base_url: ${LLM_BASE_URL}  # placeholder; presumably resolved from the environment by the loader
+    model: MiniMax-M2.7-highspeed
+    max_tokens: 1024
+  - id: balanced
+    label: MiniMax Balanced
+    provider: openai-compatible
+    base_url: ${LLM_BASE_URL}
+    model: MiniMax-M2.7
+    max_tokens: 2048
+    # If MiniMax fails (5xx, 429, timeout, network) fall through to mr-auto.
+    fallback:
+      - mr-auto  # must match the id of another entry; mr-auto is the last entry in this list
+  - id: reasoning
+    label: MiniMax Reasoning
+    provider: openai-compatible
+    base_url: ${LLM_BASE_URL}
+    model: MiniMax-M3
+    max_tokens: 4096
+  - id: mr-auto
+    label: MR Auto (llm.rikrdo.com)
+    provider: openai-compatible
+    base_url: https://llm.rikrdo.com/v1  # fixed URL, unlike the placeholder used by the MiniMax entries
+    api_key_env: MR_AUTO_API_KEY  # presumably the NAME of the env var holding the key; confirm in the loader
+    model: mr-auto
+    max_tokens: 8192
--- a/config/n8n-workflows.yml
+++ b/config/n8n-workflows.yml
@@ -0,0 +1,35 @@
+# n8n-style external workflow links.
+#
+# These are NOT executed by the SIC backend. Each entry is a deep link into
+# an n8n (or similar) workflow runner that the user can open in a new tab.
+# The LLM can recommend them as "external" suggestions; the right panel
+# renders them with a distinct icon and the action only opens a new tab.
+#
+# Fields:
+#   id            stable identifier (kebab-case recommended)
+#   label         human-readable label
+#   description   what the workflow does
+#   url           absolute external URL (env vars supported)
+#   required_roles user roles allowed to see this link (optional, [] = any)
+#   tags          free-form tags to help retrieval / filtering
+n8n_workflows:  # NOTE(review): if ":?" follows shell semantics, an unset N8N_BASE_URL is a load error; confirm
+  - id: n8n-vpn-restart
+    label: Restart VPN tunnel (n8n)
+    description: Opens the n8n workflow that restarts the VPN tunnel after credential rotation.
+    url: ${N8N_BASE_URL:?set N8N_BASE_URL}/workflow/vpn-restart
+    required_roles:  # the only link in this list restricted to specific roles
+      - webhook-runner
+      - sre
+    tags: [vpn, network, automation]
+  - id: n8n-incident-summary
+    label: Generate incident summary (n8n)
+    description: Opens the n8n workflow that drafts an incident summary from the current session.
+    url: ${N8N_BASE_URL:?set N8N_BASE_URL}/workflow/incident-summary
+    required_roles: []  # empty list = any role, per the field notes in this file's header
+    tags: [incident, automation, reporting]
+  - id: n8n-postmortem
+    label: Open postmortem workflow (n8n)
+    description: Opens the postmortem workflow in n8n with the current session context.
+    url: ${N8N_BASE_URL:?set N8N_BASE_URL}/workflow/postmortem
+    required_roles: []  # empty list = any role, per the field notes in this file's header
+    tags: [postmortem, automation]
--- a/config/rag.yml
+++ b/config/rag.yml
@@ -0,0 +1,42 @@
+# Retrieval-Augmented Generation (RAG) configuration.
+#
+# SIC treats the knowledge base as an external service. The RAG service is
+# expected to expose:
+#   POST {endpoint}/search
+#     body: { query, limit, min_relevance, include_tags, exclude_tags }
+#     returns: { items: [{ id, title, source, tags, relevance, excerpt, content? }] }
+#   GET  {endpoint}/docs/:id
+#     returns: { id, title, source, tags, owner?, updated?, headings, content }
+#
+# For local dev (or when no endpoint is configured) the docs repository
+# falls back to reading Markdown files from `knowledge/` and applying the
+# token-overlap scoring in apps/api/src/docs/repository.ts.
+#
+# Fields:
+#   endpoint           external RAG service base URL (no trailing slash).
+#                      Leave empty to use the local fallback.
+#   auth_token         optional bearer token sent in the Authorization header.
+#   timeout_ms         HTTP request timeout. Default: 10000.
+#   fallback_to_local  when true (default), use the local knowledge/ directory
+#                      if the external endpoint fails. Set to false to fail
+#                      closed.
+#   chunk_strategy     how to split a Markdown doc into chunks (local mode only)
+#     - "heading"  : split on H1/H2/H3, each chunk is a section
+#     - "paragraph": split on blank lines, each chunk is a paragraph block
+#     - "fixed"    : split on a fixed character length (chunk_size_chars)
+#   chunk_size_chars   only used by "fixed" strategy (local mode only)
+#   top_k              max chunks returned per query
+#   min_relevance      chunks with relevance below this are dropped
+#   include_tags       optional global include filter
+#   exclude_tags       optional global exclude filter
+rag:
+  endpoint: ${RAG_ENDPOINT_URL:}  # NOTE(review): confirm the bare ":" form resolves to an empty default when unset
+  auth_token: ${RAG_AUTH_TOKEN:}  # NOTE(review): same bare ":" placeholder form as endpoint
+  timeout_ms: 10000  # milliseconds; same as the documented default
+  fallback_to_local: true
+  chunk_strategy: heading  # one of: heading, paragraph, fixed
+  chunk_size_chars: 1500  # only read when chunk_strategy is fixed
+  top_k: 5
+  min_relevance: 0.0  # assuming scores are never negative, nothing is dropped at this threshold; confirm
+  include_tags: []  # no global include filter
+  exclude_tags: []  # no global exclude filter
--- a/config/skills.yml
+++ b/config/skills.yml
@@ -0,0 +1,43 @@
+# Skills are prompt fragments injected into the LLM's system prompt.
+# They are persona/behavior customizations, NOT capabilities: the model still
+# only recommends actions and the backend executes them.
+#
+# Fields:
+#   id          stable identifier (kebab-case recommended)
+#   name        human-readable label
+#   description what the skill does (safe to expose via /api/skills)
+#   enabled     whether the fragment is injected (true/false)
+#   prompt      the system prompt fragment to inject
+#
+# All skills are loaded at API boot. Restart the API after editing this file.
+skills:  # three fragments: two enabled, security-incident disabled
+  - id: sre-on-call
+    name: SRE on-call mode
+    description: Respond as a senior SRE handling a production page.
+    enabled: true
+    prompt: |
+      You are responding as a senior SRE who is on-call. Be concise and operational.
+      Always reference the runbook ids from internal_docs when relevant. Prefer
+      concrete actions over abstract advice. When the user's intent is ambiguous,
+      ask one short clarifying question instead of guessing.
+
+  - id: blameless-postmortem
+    name: Blameless postmortem writer
+    description: Help write blameless postmortems using the standard template.
+    enabled: true
+    prompt: |
+      When the user asks for a postmortem or incident review, follow the
+      postmortem runbook template exactly. Use blameless language: focus on
+      systemic causes and contributing factors, never on individual blame.
+      The output must include: Summary, Timeline, Root cause, What went well,
+      What went wrong, Corrective actions, Lessons learned.
+
+  - id: security-incident
+    name: Security incident response
+    description: Guide containment and coordination for security incidents.
+    enabled: false  # the only disabled skill; per the header notes its fragment is not injected
+    prompt: |
+      When the user describes a security incident, prioritize containment and
+      evidence preservation before root-cause analysis. Recommend involving the
+      Security IR team and the Communications Lead. Never suggest actions that
+      would destroy forensic evidence.
--- a/config/webhooks.yml
+++ b/config/webhooks.yml
@@ -0,0 +1,81 @@
+webhooks:  # six entries; every url is a WEBHOOK_*_URL placeholder
+  - id: vpn-diagnostic
+    label: Run VPN diagnostic
+    description: Runs a diagnostic on the VPN server and returns latency and health metrics.
+    method: POST
+    url: ${WEBHOOK_VPN_DIAGNOSTIC_URL}
+    required_roles:
+      - webhook-runner
+    confirmation_required: true
+    payload_template:  # double-brace placeholders are presumably filled per request by the backend; confirm
+      source: pi-chat
+      user_id: "{{user.id}}"
+      session_id: "{{session.id}}"
+      last_user_message: "{{chat.last_user_message}}"
+  - id: service-restart
+    label: Restart service
+    description: Restarts a system service. The service name is read from the payload.
+    method: POST
+    url: ${WEBHOOK_SERVICE_RESTART_URL}
+    required_roles:
+      - webhook-runner
+      - sre
+    confirmation_required: true
+    payload_template:
+      source: pi-chat
+      user_id: "{{user.id}}"
+      session_id: "{{session.id}}"
+      service: "{{payload.service}}"  # service name supplied in the request payload (see description)
+      last_user_message: "{{chat.last_user_message}}"
+  - id: dns-flush
+    label: Flush local DNS
+    description: Clears the local DNS cache and returns the result.
+    method: POST
+    url: ${WEBHOOK_DNS_FLUSH_URL}
+    required_roles:
+      - webhook-runner
+    confirmation_required: true
+    payload_template:
+      source: pi-chat
+      user_id: "{{user.id}}"
+      session_id: "{{session.id}}"
+      last_user_message: "{{chat.last_user_message}}"
+  - id: disk-cleanup
+    label: Clean /tmp
+    description: Removes files in /tmp older than 7 days.
+    method: POST
+    url: ${WEBHOOK_DISK_CLEANUP_URL}
+    required_roles:
+      - webhook-runner
+      - sre
+    confirmation_required: true
+    payload_template:
+      source: pi-chat
+      user_id: "{{user.id}}"
+      session_id: "{{session.id}}"
+      last_user_message: "{{chat.last_user_message}}"
+  - id: log-tail
+    label: Tail service log
+    description: Returns the last N lines of a service log.
+    method: GET  # the only non-POST entry; NOTE(review): confirm how payload_template is sent on GET
+    url: ${WEBHOOK_LOG_TAIL_URL}
+    required_roles:
+      - webhook-runner
+    confirmation_required: false  # the only entry with confirmation turned off
+    payload_template:  # NOTE(review): omits source/user_id/session_id sent by every other entry; confirm
+      service: "{{payload.service}}"
+      lines: "{{payload.lines}}"
+  - id: cache-purge
+    label: Purge CDN cache
+    description: Invalidates the CDN cache for the provided paths.
+    method: POST
+    url: ${WEBHOOK_CACHE_PURGE_URL}
+    required_roles:
+      - webhook-runner
+    confirmation_required: true
+    payload_template:
+      source: pi-chat
+      user_id: "{{user.id}}"
+      session_id: "{{session.id}}"
+      paths: "{{payload.paths}}"  # NOTE(review): description implies several paths; confirm how a list renders here
+      last_user_message: "{{chat.last_user_message}}"