How a question becomes a cited answer

Follow one question — "What is the cap on the firm's liability?" — from the qa agent through local embeddings to a verified [Document § 9] citation in the browser.
dochaus/tool/search-document.ts · 322 lines · embed L27–34
Outline · 18 symbols
MODEL const
DIM const
extractor const
embed function
dot function
CANDIDATES const
RRF_K const
ChunkRow type
vectorChannel function
lexicalChannel function
phraseChannel function
RECENT_LIMIT const
SESSION_LIMIT const
recentBySession const
ATTACH_CAP const
attachDefinitions function
EXCERPT_REF_RE const
attachCrossRefs function
1import { tool } from "@opencode-ai/plugin"
2import { Database } from "bun:sqlite"
3import { existsSync } from "node:fs"
4import path from "node:path"
5import { formatCitations } from "../lib/citations"
6import { pendingRedlinesForDoc } from "../lib/redlines"
7
// doc.haus retrieval tool. Reads the per-matter legal.db that
// `services/ingest` populates and runs the query through three channels — the
// same local embedding model used at ingest time, the BM25-ranked FTS5 index
// ingest maintains over the same chunks, and an exact-phrase match over that
// same FTS5 index — fused into one citation list.
//
// The database lives inside the matter directory (`<matter>/.dochaus/legal.db`)
// so retrieval is naturally scoped to the active matter and never crosses into
// another matter's privileged material. This tool is read-only; all writes
// happen in the ingest service.
17
// Must stay in lockstep with services/ingest/src/embed.ts (same model id and
// pooling), or query and chunk vectors stop being comparable. The model card
// specifies CLS pooling. Queries embed raw; chunks were embedded with a
// document › section breadcrumb prepended (see embed.ts), which only the
// document side carries.
// MODEL is also compared against the index's recorded `embedding_model` before
// any search runs, so an index built with a different model is refused rather
// than silently mis-ranked.
const MODEL = "onnx-community/granite-embedding-small-english-r2-ONNX"
// Number of float32 components read from each vector by dot() and when viewing
// a stored embedding blob. Presumably the model's output width — confirm
// against embed.ts before changing either side.
const DIM = 384
25
// Memoised *promise* for the feature-extraction pipeline. Caching the promise
// (rather than the resolved pipeline) means concurrent first calls share one
// model load instead of each starting their own.
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- the pipeline's callable type is not exported in a usable form
let extractor: Promise<any> | undefined

/**
 * Embeds `text` with the local model and returns the normalized, CLS-pooled
 * vector.
 *
 * The pipeline is loaded lazily on first use. If loading fails the cached
 * promise is dropped, so the next call retries instead of replaying the same
 * rejection forever.
 *
 * @param text Raw query text (no breadcrumb prefix — only ingest adds one).
 * @returns The embedding as the pipeline's `data` buffer.
 * @throws Whatever the dynamic import, pipeline construction, or inference throws.
 */
async function embed(text: string) {
  if (!extractor) {
    const loading = (async () => {
      const { pipeline } = await import("@huggingface/transformers")
      return pipeline("feature-extraction", MODEL, { dtype: "q8" })
    })()
    extractor = loading
    // Clear the cache only if it still holds this failed attempt; a newer
    // attempt started in the meantime must not be discarded.
    loading.catch(() => {
      if (extractor === loading) extractor = undefined
    })
  }
  const run = await extractor
  const output = await run(text, { pooling: "cls", normalize: true })
  return output.data as Float32Array
}
35
/**
 * Inner product of two embedding vectors over their first DIM components.
 * Both inputs are normalized at embed time, so this doubles as cosine
 * similarity for the vectors this file handles.
 */
function dot(a: Float32Array, b: Float32Array) {
  let total = 0
  let index = 0
  // Accumulate strictly left to right so float rounding matches every call.
  while (index < DIM) {
    total += a[index] * b[index]
    index += 1
  }
  return total
}
41
// Hybrid retrieval (issue #67): several channels over the same chunks —
// embedding cosine for meaning, FTS5/BM25 for exact tokens (section numbers,
// defined terms, party names), and a whole-query phrase match — fused with
// reciprocal-rank fusion. RRF works on ranks alone, so the channels'
// incomparable score scales never need normalizing.
//
// CANDIDATES caps how many rows each channel contributes to the fusion.
const CANDIDATES = 20
// RRF damping constant: a chunk at 0-based position i in a channel contributes
// 1 / (RRF_K + 1 + i) to its fused score.
const RRF_K = 60
48
// One row of the `chunks` table as selected by the retrieval channels. The
// same id is used as the FTS5 rowid, which is how lexical hits join back.
type ChunkRow = {
  // Primary key; also the key results are fused on across channels.
  id: number
  // Document name; the optional `document` argument filters on this exactly.
  doc_name: string
  // Document path; used to look up defined terms, cross-references and
  // pending redlines for the cited document.
  doc_path: string
  // Section label the chunk belongs to (e.g. compared against "8.3"-style refs).
  section: string
  // Chunk body, returned verbatim as the citation excerpt.
  text: string
  // Passed through to the citation as charStart / charEnd — presumably
  // character offsets into the source document; confirm against ingest.
  char_start: number
  char_end: number
  // 1 when ingest flagged the passage as instruction-like (issue #17).
  flagged: number
}
59
/**
 * Views a stored embedding blob as DIM float32 values.
 *
 * A typed-array view over an existing buffer must start on a 4-byte boundary,
 * otherwise the Float32Array constructor throws a RangeError. A blob handed
 * back at an unaligned offset is therefore copied into a fresh buffer (offset
 * 0) first; aligned blobs are viewed in place with no copy.
 */
function embeddingView(blob: Uint8Array): Float32Array {
  const aligned = blob.byteOffset % Float32Array.BYTES_PER_ELEMENT === 0 ? blob : blob.slice()
  return new Float32Array(aligned.buffer, aligned.byteOffset, DIM)
}

/**
 * Semantic channel: scores every chunk (optionally only those of one
 * document) by dot product against the query vector and returns the
 * CANDIDATES best, highest score first.
 *
 * @param db Open handle on the matter index.
 * @param queryVec Query embedding produced by embed().
 * @param document Exact document name to restrict to; falsy searches the whole matter.
 * @returns Up to CANDIDATES rows in descending similarity order.
 */
function vectorChannel(db: Database, queryVec: Float32Array, document?: string): ChunkRow[] {
  const sql =
    "SELECT id, doc_name, doc_path, section, text, char_start, char_end, embedding, flagged FROM chunks" +
    (document ? " WHERE doc_name = ?" : "")
  const rows = (document ? db.query(sql).all(document) : db.query(sql).all()) as Array<
    ChunkRow & { embedding: Uint8Array }
  >
  return rows
    .map((row) => ({ row, score: dot(queryVec, embeddingView(row.embedding)) }))
    .sort((a, b) => b.score - a.score)
    .slice(0, CANDIDATES)
    .map((scored) => scored.row)
}
76
// Lexical channel. Raw punctuation is a syntax error in FTS5's MATCH grammar,
// so each whitespace-separated word is emitted as its own double-quoted phrase
// (embedded quotes doubled). Quoting is also what makes dotted identifiers
// usable: unicode61 tokenizes "8.3" into adjacent tokens and the quoted phrase
// matches precisely that sequence. The phrases are OR'd together; BM25's IDF
// term then lets a rare word (a section number, a party name) drive the ranking
// while near-stopwords barely move it.
function lexicalChannel(db: Database, query: string, document?: string): ChunkRow[] {
  const phrases: string[] = []
  for (const word of query.split(/\s+/)) {
    // Words with no letter or digit would tokenize to nothing — drop them.
    if (!/[\p{L}\p{N}]/u.test(word)) continue
    phrases.push('"' + word.replaceAll('"', '""') + '"')
  }
  if (phrases.length === 0) return []

  const bindings: Array<string | number> = [phrases.join(" OR ")]
  let filter = "WHERE chunks_fts MATCH ?"
  if (document) {
    filter += " AND c.doc_name = ?"
    bindings.push(document)
  }
  bindings.push(CANDIDATES)

  // bm25() sorts best-first when ascending. The column weights favour the
  // section label and document name over body text, so a query that names a
  // clause or a document surfaces that clause's / document's own chunks ahead
  // of passing mentions.
  const statement =
    "SELECT c.id, c.doc_name, c.doc_path, c.section, c.text, c.char_start, c.char_end, c.flagged" +
    ` FROM chunks_fts f JOIN chunks c ON c.id = f.rowid ${filter}` +
    " ORDER BY bm25(chunks_fts, 1.0, 2.0, 2.0) LIMIT ?"
  return db.query(statement).all(...bindings) as ChunkRow[]
}
101
// Phrase channel: the entire query submitted as a single FTS5 phrase. It hits
// only chunks that contain the query's tokens as one literal run — an exact
// section reference, defined term, or party name — and returns nothing for a
// paraphrased, semantic query. Its job in fusion is to add rank weight to the
// chunk that uniquely holds the full literal, which the two fuzzy channels
// could otherwise jointly outvote. One-word queries are skipped: the OR
// channel already covers them, and a phrase needs at least two tokens to carry
// any ordering signal.
function phraseChannel(db: Database, query: string, document?: string): ChunkRow[] {
  const words = query.split(/\s+/).filter((word) => /[\p{L}\p{N}]/u.test(word))
  if (words.length < 2) return []

  const escaped = words.map((word) => word.replaceAll('"', '""'))
  const bindings: Array<string | number> = ['"' + escaped.join(" ") + '"']
  let filter = "WHERE chunks_fts MATCH ?"
  if (document) {
    filter += " AND c.doc_name = ?"
    bindings.push(document)
  }
  bindings.push(CANDIDATES)

  const statement =
    "SELECT c.id, c.doc_name, c.doc_path, c.section, c.text, c.char_start, c.char_end, c.flagged" +
    ` FROM chunks_fts f JOIN chunks c ON c.id = f.rowid ${filter}` +
    " ORDER BY bm25(chunks_fts, 1.0, 2.0, 2.0) LIMIT ?"
  return db.query(statement).all(...bindings) as ChunkRow[]
}
122
// Per-session memory of the most recent result-sets. A model in a search loop
// keeps rephrasing the query but gets back the *same* passages every time (e.g.
// hunting a section that does not exist). When an incoming result-set matches
// one already returned in the last few calls, we short-circuit and steer the
// model to answer from what it has rather than searching again. Upstream's V2
// runner does not yet bound repeated identical tool calls (see runner/llm.ts),
// so the bound lives here at the tool boundary.
//
// RECENT_LIMIT: how many result-set signatures are remembered per session.
const RECENT_LIMIT = 5
// Cap how many sessions we keep loop-detection state for, so a long-lived
// server process does not accumulate one entry per session forever. When the
// cap is hit, the session that was *first inserted* is dropped (Map insertion
// order) — updating an existing session does not refresh its position.
const SESSION_LIMIT = 256
// sessionID → signatures of its most recent result-sets, newest first.
const recentBySession = new Map<string, string[]>()
135
/**
 * Does all database work for one search and guarantees the handle is closed.
 *
 * Opens the matter index read-only, verifies it was built with this tool's
 * embedding model, runs the three retrieval channels, fuses them with
 * reciprocal-rank fusion, and gathers the definition / cross-reference notes
 * for the fused top-k. `db.close()` sits in a `finally`, so a throwing channel
 * or attach helper can no longer leak the SQLite handle.
 *
 * @returns `null` when the index's recorded embedding model is missing or
 *   differs from MODEL; otherwise the ranked rows (score normalized to at most
 *   1) plus the two note strings.
 */
function retrieveFromIndex(
  dbPath: string,
  search: { query: string; queryVec: Float32Array; k: number; document?: string },
) {
  const db = new Database(dbPath, { readonly: true })
  try {
    // Vectors from a different embedding model are not comparable with this
    // tool's query vectors — rankings would be silently wrong. Refuse instead.
    const hasMeta = db.query("SELECT 1 FROM sqlite_master WHERE name = 'meta'").get()
    const indexModel = hasMeta
      ? (db.query("SELECT value FROM meta WHERE key = 'embedding_model'").get() as { value: string } | null)?.value
      : null
    if (indexModel !== MODEL) return null

    const channels = [
      vectorChannel(db, search.queryVec, search.document),
      lexicalChannel(db, search.query, search.document),
      phraseChannel(db, search.query, search.document),
    ]

    // score = Σ 1/(RRF_K + rank) over the channels a chunk appears in, divided
    // by the best possible sum (rank 1 in every channel) to cap it at 1. A hit
    // from a single channel therefore tops out near 1/3 — the scale ranks
    // results against each other, it is not a calibrated relevance probability.
    const fused = new Map<number, { row: ChunkRow; score: number }>()
    for (const channel of channels)
      channel.forEach((row, i) => {
        const entry = fused.get(row.id) ?? { row, score: 0 }
        entry.score += 1 / (RRF_K + 1 + i)
        fused.set(row.id, entry)
      })
    const ranked = [...fused.values()]
      .sort((a, b) => b.score - a.score)
      .slice(0, search.k)
      .map(({ row, score }) => ({ row, score: score / (channels.length / (RRF_K + 1)) }))

    return {
      ranked,
      definitionsNote: attachDefinitions(db, ranked),
      crossRefsNote: attachCrossRefs(db, ranked),
    }
  } finally {
    db.close()
  }
}

export default tool({
  description:
    "Search the current matter's documents for passages relevant to a query and return them as citations. Use this before answering any question about a document.",
  args: {
    query: tool.schema.string().describe("What to search for, in natural language"),
    k: tool.schema.number().int().min(1).max(20).optional().describe("Number of passages to return (default 5)"),
    document: tool.schema
      .string()
      .optional()
      .describe("Restrict the search to a single document by its exact name. Omit to search the whole matter."),
  },
  /**
   * Embeds the query, retrieves and fuses passages from the matter index, and
   * returns them as citations plus definition, cross-reference and
   * pending-redline notes. Returns a plain string instead when there is no
   * index, the index was built with another model, or the same result-set was
   * already returned recently in this session.
   */
  async execute(args, ctx) {
    const dbPath = path.join(ctx.directory, ".dochaus", "legal.db")
    if (!existsSync(dbPath)) {
      return "No documents have been indexed for this matter yet."
    }

    const queryVec = await embed(args.query)
    const k = args.k ?? 5

    const retrieved = retrieveFromIndex(dbPath, { query: args.query, queryVec, k, document: args.document })
    if (!retrieved) {
      return "This matter's search index was built by an older embedding model. Restart the ingest service to migrate it, then search again."
    }
    const { ranked, definitionsNote, crossRefsNote } = retrieved

    // NOTE(review): the signature keys on document name + section, so two
    // different chunks of the same section (or an empty result-set, which is
    // always "") collapse to one key — confirm that coarseness is intended.
    const signature = ranked.map(({ row }) => `${row.doc_name}§${row.section}`).join("|")
    const recent = recentBySession.get(ctx.sessionID) ?? []
    if (recent.includes(signature)) {
      console.warn(
        `[search-document] repeated result-set in session ${ctx.sessionID} for query "${args.query}" — short-circuiting to break a search loop`,
      )
      return "These passages were already returned by an earlier search this turn — the same results matched again, so searching further will not surface anything new. Answer from the passages you have already retrieved; if they do not address the question, say the documents do not cover it. Do not repeat this search."
    }
    // Make room before adding a brand-new session: drop the first-inserted one.
    if (!recentBySession.has(ctx.sessionID) && recentBySession.size >= SESSION_LIMIT) {
      const oldest = recentBySession.keys().next().value
      if (oldest !== undefined) recentBySession.delete(oldest)
    }
    recentBySession.set(ctx.sessionID, [signature, ...recent].slice(0, RECENT_LIMIT))

    const citations = ranked.map(({ row, score }) => ({
      documentName: row.doc_name,
      docPath: row.doc_path,
      section: row.section,
      excerpt: row.text,
      charStart: row.char_start,
      charEnd: row.char_end,
      score,
      // Carried through to formatCitations, which warns the model before the
      // excerpt that ingest flagged this passage as instruction-like (issue #17).
      flagged: row.flagged === 1,
    }))

    // Surface proposals already pending on the cited documents. The retrieved
    // passages reflect the clean (accepted) document on disk — they do NOT include
    // edits the assistant proposed earlier this negotiation but that are not yet
    // accepted. Listing them keeps the model from re-proposing or contradicting a
    // change it already made, and lets it compose new edits against the running state.
    const pending = [...new Set(citations.map((c) => c.docPath))].flatMap((docPath) =>
      pendingRedlinesForDoc(ctx.directory, docPath).map((r) => ({ docName: path.basename(docPath), ...r })),
    )
    const pendingNote = pending.length
      ? `\n\nPending redlines on these documents (proposed but not yet accepted — not reflected in the passages above):\n` +
        pending.map((r) => `- #${r.id} (${r.author}) in ${r.docName}: proposes "${r.new_text}"`).join("\n")
      : ""

    return {
      title: `${citations.length} passage(s) for "${args.query}"`,
      output: (formatCitations(citations) || "No relevant passages found.") + definitionsNote + crossRefsNote + pendingNote,
      metadata: { citations, pending },
    }
  },
})
240
// Auto-attach the verbatim definitions of defined terms that appear inside the
// returned excerpts. A passage like "during the Cure Period" silently depends
// on a definition that lives pages away; attaching it saves the model a lookup
// and keeps it from guessing the meaning. Matching is case-sensitive whole-word
// (defined terms are capitalized — case folding would match ordinary prose),
// definition text is verbatim from the defined_terms table, and the list is
// deduped and capped so a definition-dense passage cannot flood the response.
const ATTACH_CAP = 5

/**
 * Builds the "Defined terms used in these passages" note for a result-set.
 *
 * @param db Open handle on the matter index (reads `defined_terms`).
 * @param ranked The fused results whose excerpts are scanned.
 * @returns The note, prefixed with a blank line, or "" when nothing applies.
 */
function attachDefinitions(db: Database, ranked: Array<{ row: ChunkRow }>) {
  const docPaths = [...new Set(ranked.map(({ row }) => row.doc_path))]
  if (!docPaths.length) return ""
  const terms = db
    .query(`SELECT doc_path, doc_name, term, definition FROM defined_terms WHERE doc_path IN (${docPaths.map(() => "?").join(",")})`)
    .all(...docPaths) as Array<{ doc_path: string; doc_name: string; term: string; definition: string }>

  const isWordChar = (ch: string | undefined) => ch !== undefined && /[A-Za-z0-9]/.test(ch)
  // True when `term` occurs in `text` as a whole word. Every occurrence is
  // checked, not just the first: in "All Agreements … this Agreement" the
  // first hit is inside "Agreements" and only the second one is a real use.
  const usesTerm = (text: string, term: string) => {
    // An empty term matches at every index and would never advance the scan.
    if (!term) return false
    for (let at = text.indexOf(term); at !== -1; at = text.indexOf(term, at + 1)) {
      if (!isWordChar(text[at - 1]) && !isWordChar(text[at + term.length])) return true
    }
    return false
  }

  const attached = terms
    .filter((t) =>
      ranked.some(({ row }) => {
        if (row.doc_path !== t.doc_path) return false
        // Skip excerpts that are themselves the definition site — attaching the
        // definition under the passage that states it is noise.
        if (row.text.includes(`"${t.term}"`) || row.text.includes(`“${t.term}”`)) return false
        return usesTerm(row.text, t.term)
      }),
    )
    .filter((t, i, all) => all.findIndex((o) => o.term === t.term && o.doc_path === t.doc_path) === i)
    // Longer terms first: "Original Credit Agreement" beats "Credit Agreement"
    // for the cap when both match the same excerpt.
    .sort((a, b) => b.term.length - a.term.length)
    .slice(0, ATTACH_CAP)
  if (!attached.length) return ""
  return (
    `\n\nDefined terms used in these passages (definitions verbatim from the documents):\n` +
    attached.map((t) => `- "${t.term}" (${t.doc_name}): ${t.definition}`).join("\n")
  )
}
281
// Auto-resolve numbered cross-references appearing in the excerpts ("as set
// forth in Section 8.3") against the citing document's own section labels, so
// the model knows the referenced section is one get-section call away instead
// of searching for it — or that it is not in the indexed text at all.
const EXCERPT_REF_RE =
  /\b(Section|Article|Clause|Exhibit|Schedule|Annex|Appendix)\s+(\d+(?:\.\d+)*(?:\([a-z]+\))*|[A-Z](?![A-Za-z])|[IVXLC]+(?![A-Za-z]))/g

/**
 * Builds the "Cross-references … that resolve to indexed sections" note.
 *
 * References are taken in excerpt order, a passage's reference to its own
 * section is ignored, duplicates (same document, kind and label) are dropped,
 * and each remaining one is looked up in that document's section labels —
 * either an exact label or a child label ("8.3" also resolves via "8.3.1").
 * The scan stops as soon as ATTACH_CAP references have resolved, so no
 * lookups are issued for references that would be cut by the cap anyway.
 *
 * @param db Open handle on the matter index (reads `chunks.section`).
 * @param ranked The fused results whose excerpts are scanned.
 * @returns The note, prefixed with a blank line, or "" when nothing resolves.
 */
function attachCrossRefs(db: Database, ranked: Array<{ row: ChunkRow }>) {
  const lookup = db.query(
    "SELECT section FROM chunks WHERE doc_path = ?1 AND (section = ?2 OR section LIKE ?3 || '.%' ESCAPE '\\') LIMIT 1",
  )
  const seen = new Set<string>()
  const resolved: Array<{ kind: string; label: string; docName: string; section: string }> = []

  scan: for (const { row } of ranked) {
    for (const m of row.text.matchAll(EXCERPT_REF_RE)) {
      const kind = m[1]
      const label = m[2]
      // A passage pointing at its own section adds nothing.
      if (label === row.section) continue
      const key = JSON.stringify([row.doc_path, kind, label])
      if (seen.has(key)) continue
      seen.add(key)
      // EXCERPT_REF_RE labels cannot contain %/_ today; escape anyway so a
      // widened ref pattern can never turn the label into a LIKE wildcard.
      const hit = lookup.get(row.doc_path, label, label.replace(/[\\%_]/g, "\\$&")) as { section: string } | null
      if (!hit) continue
      resolved.push({ kind, label, docName: row.doc_name, section: hit.section })
      if (resolved.length === ATTACH_CAP) break scan
    }
  }

  if (!resolved.length) return ""
  return (
    `\n\nCross-references in these passages that resolve to indexed sections (fetch with the get-section tool):\n` +
    resolved.map((ref) => `- ${ref.kind} ${ref.label} → section "${ref.section}" of ${ref.docName}`).join("\n")
  )
}
322
No results