The Atlas doc.haus documentation, bound to its code
108 documents

How a question becomes a cited answer

Follow one question — "What is the cap on the firm's liability?" — from the qa agent through local embeddings to a verified [Document § 9] citation in the browser.

dochaus/tool/search-document.ts · 137 lines · embed L21–28
Outline 8 symbols
1import { tool } from "@opencode-ai/plugin"
2import { Database } from "bun:sqlite"
3import { existsSync } from "node:fs"
4import path from "node:path"
5import { formatCitations } from "../lib/citations"
6import { pendingRedlinesForDoc } from "../lib/redlines"
7
8// doc.haus retrieval tool. Reads the per-matter legal.db that
9// `services/ingest` populates, embeds the query with the same local MiniLM model
10// used at ingest time, and returns the closest document chunks as citations.
11//
12// The database lives inside the matter directory (`<matter>/.dochaus/legal.db`)
13// so retrieval is naturally scoped to the active matter and never crosses into
14// another matter's privileged material. This tool is read-only; all writes
15// happen in the ingest service.
16
// Hugging Face model id handed to the transformers.js feature-extraction pipeline.
const MODEL = "Xenova/all-MiniLM-L6-v2"
// Number of float32 components in every embedding this file reads or compares.
const DIM = 384

// In-flight (or completed) pipeline load. Caching the *promise* rather than the
// resolved pipeline means concurrent first calls share one load instead of each
// importing and initialising the model separately.
let extractor: Promise<any> | undefined

/**
 * Embeds `text` with the local MiniLM model using mean pooling and
 * normalisation.
 *
 * The pipeline is created lazily on first use and reused afterwards. If the
 * load fails, the cached promise is cleared so a later call can retry instead
 * of being stuck on a permanently rejected promise.
 *
 * @param text Natural-language text to embed.
 * @returns The pipeline output's `data`, typed as a Float32Array.
 * @throws Whatever the dynamic import, pipeline creation, or inference throws.
 */
async function embed(text: string): Promise<Float32Array> {
  if (!extractor) {
    const loading = import("@xenova/transformers").then(({ pipeline }) =>
      pipeline("feature-extraction", MODEL),
    )
    // Side branch only: resets the cache on failure. The rejection itself still
    // reaches every caller through the `await extractor` below.
    loading.catch(() => {
      if (extractor === loading) extractor = undefined
    })
    extractor = loading
  }
  const run = await extractor
  const output = await run(text, { pooling: "mean", normalize: true })
  return output.data as Float32Array
}
29
/**
 * Inner product of the first DIM components of `a` and `b`, accumulated in
 * ascending index order.
 *
 * The query vector comes from `embed`, which asks for normalised output; if
 * the stored vectors are unit-length too (presumably — verify against the
 * ingest service) this value is their cosine similarity.
 */
function dot(a: Float32Array, b: Float32Array) {
  let acc = 0
  let i = 0
  while (i < DIM) {
    acc += a[i] * b[i]
    i += 1
  }
  return acc
}
35
36// Per-session memory of the most recent result-sets. A model in a search loop
37// keeps rephrasing the query but gets back the *same* passages every time (e.g.
38// hunting a section that does not exist). When an incoming result-set matches
39// one already returned in the last few calls, we short-circuit and steer the
40// model to answer from what it has rather than searching again. Upstream's V2
41// runner does not yet bound repeated identical tool calls (see runner/llm.ts),
42// so the bound lives here at the tool boundary.
// How many of a session's most recent result-set signatures are remembered.
const RECENT_LIMIT = 5
// Cap how many sessions we keep loop-detection state for, so a long-lived
// server process does not accumulate one entry per session forever.
const SESSION_LIMIT = 256
// sessionID -> result-set signatures, newest first.
const recentBySession = new Map<string, string[]>()

/** One row of the `chunks` table as selected by the search query. */
type ChunkRow = {
  doc_name: string
  doc_path: string
  section: string
  text: string
  char_start: number
  char_end: number
  embedding: Uint8Array
}

/**
 * Views an embedding BLOB as a Float32Array of DIM components.
 *
 * A Float32Array view must start at a byte offset that is a multiple of 4.
 * When the BLOB's view into its backing buffer is not aligned that way, the
 * bytes are copied into a fresh buffer (offset 0) first instead of throwing.
 *
 * @throws RangeError if the BLOB holds fewer than DIM float32 values.
 */
function decodeEmbedding(blob: Uint8Array): Float32Array {
  if (blob.byteOffset % Float32Array.BYTES_PER_ELEMENT === 0) {
    return new Float32Array(blob.buffer, blob.byteOffset, DIM)
  }
  return new Float32Array(blob.slice().buffer, 0, DIM)
}

/**
 * Retrieval tool: embeds the query, scores every chunk in the matter's
 * `.dochaus/legal.db` by dot product, and returns the top `k` as citations,
 * followed by any redlines still pending on the cited documents.
 *
 * `execute` returns a plain string when the database file does not exist or
 * when the same result-set was already returned recently in this session;
 * otherwise it returns `{ title, output, metadata }`.
 */
export default tool({
  description:
    "Search the current matter's documents for passages relevant to a query and return them as citations. Use this before answering any question about a document.",
  args: {
    query: tool.schema.string().describe("What to search for, in natural language"),
    k: tool.schema.number().int().min(1).max(20).optional().describe("Number of passages to return (default 5)"),
    document: tool.schema
      .string()
      .optional()
      .describe("Restrict the search to a single document by its exact name. Omit to search the whole matter."),
  },
  async execute(args, ctx) {
    const dbPath = path.join(ctx.directory, ".dochaus", "legal.db")
    if (!existsSync(dbPath)) {
      return "No documents have been indexed for this matter yet."
    }

    const queryVec = await embed(args.query)
    const k = args.k ?? 5

    const sql =
      "SELECT doc_name, doc_path, section, text, char_start, char_end, embedding FROM chunks" +
      (args.document ? " WHERE doc_name = ?" : "")

    // Close the handle even when preparing or running the query throws, so a
    // failed search does not leave the database open.
    const db = new Database(dbPath, { readonly: true })
    let rows: ChunkRow[]
    try {
      rows = (args.document ? db.query(sql).all(args.document) : db.query(sql).all()) as ChunkRow[]
    } finally {
      db.close()
    }

    // Score every chunk against the query and keep the k highest.
    const ranked = rows
      .map((row) => ({ row, score: dot(queryVec, decodeEmbedding(row.embedding)) }))
      .sort((a, b) => b.score - a.score)
      .slice(0, k)

    // A result-set is identified by its ordered (document, section) pairs; the
    // query text is deliberately not part of the signature, so a rephrased query
    // that lands on the same passages counts as a repeat.
    const signature = ranked.map(({ row }) => `${row.doc_name}§${row.section}`).join("|")
    const recent = recentBySession.get(ctx.sessionID) ?? []
    if (recent.includes(signature)) {
      console.warn(
        `[search-document] repeated result-set in session ${ctx.sessionID} for query "${args.query}" — short-circuiting to break a search loop`,
      )
      return "These passages were already returned by an earlier search this turn — the same results matched again, so searching further will not surface anything new. Answer from the passages you have already retrieved; if they do not address the question, say the documents do not cover it. Do not repeat this search."
    }
    if (recentBySession.has(ctx.sessionID)) {
      // Map iteration follows insertion order and set() on an existing key keeps
      // its old position. Re-insert so an active session becomes the newest
      // entry and eviction below removes the least recently updated session.
      recentBySession.delete(ctx.sessionID)
    } else if (recentBySession.size >= SESSION_LIMIT) {
      const oldest = recentBySession.keys().next().value
      if (oldest !== undefined) recentBySession.delete(oldest)
    }
    recentBySession.set(ctx.sessionID, [signature, ...recent].slice(0, RECENT_LIMIT))

    const citations = ranked.map(({ row, score }) => ({
      documentName: row.doc_name,
      docPath: row.doc_path,
      section: row.section,
      excerpt: row.text,
      charStart: row.char_start,
      charEnd: row.char_end,
      score,
    }))

    // Surface proposals already pending on the cited documents. The retrieved
    // passages reflect the clean (accepted) document on disk — they do NOT include
    // edits the assistant proposed earlier this negotiation but that are not yet
    // accepted. Listing them keeps the model from re-proposing or contradicting a
    // change it already made, and lets it compose new edits against the running state.
    const pending = [...new Set(citations.map((c) => c.docPath))].flatMap((docPath) =>
      pendingRedlinesForDoc(ctx.directory, docPath).map((r) => ({ docName: path.basename(docPath), ...r })),
    )
    const pendingNote = pending.length
      ? `\n\nPending redlines on these documents (proposed but not yet accepted — not reflected in the passages above):\n` +
        pending.map((r) => `- #${r.id} (${r.author}) in ${r.docName}: proposes "${r.new_text}"`).join("\n")
      : ""

    return {
      title: `${citations.length} passage(s) for "${args.query}"`,
      output: (formatCitations(citations) || "No relevant passages found.") + pendingNote,
      metadata: { citations, pending },
    }
  },
})
137