How a question becomes a cited answer
Follow one question — "What is the cap on the firm's liability?" — from the qa agent through local embeddings to a verified [Document § 9] citation in the browser.
dochaus/plugin/legal.ts · 79 lines · LegalPlugin L50–78
Outline 5 symbols
- CACHE_LIMIT const
- textCache const
- indexedText function
- extract function
- LegalPlugin function export
1import type { Plugin } from "@opencode-ai/plugin"
2import { existsSync, statSync } from "node:fs"
3import { formatCitations, type DocumentCitation } from "../lib/citations"
4
5// doc.haus legal plugin — citation verification (issue #6).
6//
7// After `search-document` returns, every citation's span is re-checked against
8// the live source file: the document is re-extracted with the same extractors
9// ingest uses (mammoth for DOCX, unpdf for PDF) and the cited span must
10// reproduce the excerpt exactly. A mismatch means the retrieval index is stale
11// — the file changed after it was indexed — so the citation is rejected before
12// the model or the user ever sees it, and the model is told the document needs
13// re-ingesting. A lawyer must never be handed a quote whose span no longer
14// exists in the document.
15
// Maximum number of document paths kept in `textCache` at once.
const CACHE_LIMIT = 64

// Extracted text, keyed by document path. Extraction is done once per
// document rather than once per citation; the stored mtime invalidates the
// entry, so an accepted redline or re-upload is picked up immediately.
const textCache: Map<string, { mtimeMs: number; text: string }> = new Map()
20
// Returns the text of `docPath` laid out the way ingest indexed it, so that
// citation offsets can be sliced straight out of the result. The text is
// memoised in `textCache` and reused while the file's mtime is unchanged.
// Throws if the file cannot be stat'ed, read, or extracted.
async function indexedText(docPath: string): Promise<string> {
  // Stat first, read second: if the file is rewritten in between, the entry is
  // stored under the older mtime and the next call re-extracts.
  const { mtimeMs } = statSync(docPath)
  const cached = textCache.get(docPath)
  if (cached !== undefined && cached.mtimeMs === mtimeMs) return cached.text

  const bytes = Buffer.from(await Bun.file(docPath).arrayBuffer())
  // Ingest computes offsets by accumulating `line + "\n"` over the extracted
  // text, i.e. the raw extraction plus exactly one trailing newline. Mirror
  // that here, or the final chunk of every document fails verification.
  const text = `${await extract(docPath, bytes)}\n`

  // Only a brand-new path can grow the cache; refreshing a stale entry
  // overwrites it in place. When full, drop the first key in the Map's
  // (insertion-ordered) iteration to make room.
  const addsNewPath = !textCache.has(docPath)
  if (addsNewPath && textCache.size >= CACHE_LIMIT) {
    for (const oldestPath of textCache.keys()) {
      textCache.delete(oldestPath)
      break
    }
  }

  textCache.set(docPath, { mtimeMs, text })
  return text
}
37
// Converts a document's raw bytes to plain text: unpdf for paths ending in
// ".pdf" (case-insensitive), mammoth for every other path.
// Must stay in lockstep with extractDocumentText in services/ingest/src/ingest.ts:
// same extractors, same options, or offsets stop lining up.
async function extract(docPath: string, buffer: Buffer) {
  const isPdf = docPath.toLowerCase().endsWith(".pdf")

  if (!isPdf) {
    // Extractor modules are imported on demand, so only the one needed loads.
    const mammoth = (await import("mammoth")).default
    const result = await mammoth.extractRawText({ buffer })
    return result.value
  }

  const { getDocumentProxy, extractText } = await import("unpdf")
  const pdf = await getDocumentProxy(new Uint8Array(buffer))
  // mergePages: true is passed so the result carries a single `text` string.
  const { text } = await extractText(pdf, { mergePages: true })
  return text
}
49
// Fail-closed span check for a single citation. Resolves to true only when the
// live file exists and its extracted text, sliced at the cited offsets,
// reproduces the excerpt exactly.
//
// Any failure while stat'ing, reading, or extracting the document (for
// example the file vanishing after the existence check, or an extractor
// rejecting a corrupt file) counts as "not live" instead of propagating. A
// citation that cannot be verified must be rejected, and one unreadable
// document must not abort verification of citations from other documents.
async function citationIsLive(citation: DocumentCitation): Promise<boolean> {
  if (!existsSync(citation.docPath)) return false
  try {
    const liveText = await indexedText(citation.docPath)
    return liveText.slice(citation.charStart, citation.charEnd) === citation.excerpt
  } catch {
    return false
  }
}

// Plugin hook that verifies `search-document` citations against the live
// source files after the tool has run.
//
// - Other tools, and results without citations, are left untouched.
// - `output.metadata.citations` is replaced with only the citations that
//   verified, each marked `verified: true`.
// - If anything was rejected, `output.title` and `output.output` are rewritten
//   to show the surviving passages and to tell the model which documents need
//   re-ingesting. Missing and unreadable documents are reported with the same
//   stale-index message as mismatched spans.
export const LegalPlugin: Plugin = async () => ({
  "tool.execute.after": async (input, output) => {
    if (input.tool !== "search-document") return
    const citations = output.metadata?.citations as DocumentCitation[] | undefined
    if (!citations?.length) return

    const verified: DocumentCitation[] = []
    const rejected: DocumentCitation[] = []
    // Checked one at a time so citations that share a document reuse the text
    // cached by the first lookup instead of extracting it concurrently.
    for (const citation of citations) {
      if (await citationIsLive(citation)) verified.push({ ...citation, verified: true })
      else rejected.push(citation)
    }

    output.metadata.citations = verified
    if (!rejected.length) return

    // De-duplicate document names so each stale document is listed once.
    const staleDocs = [...new Set(rejected.map((c) => c.documentName))].join(", ")
    output.title = `${verified.length} verified passage(s) (${rejected.length} rejected as stale)`
    output.output =
      (formatCitations(verified) || "No passages survived verification.") +
      `\n\n[citation-verification] ${rejected.length} citation(s) from ${staleDocs} were rejected: ` +
      `their cited spans no longer match the live document, so the index is stale. ` +
      `Do not quote or rely on the rejected passages. Tell the user that ${staleDocs} ` +
      `changed after indexing and must be re-uploaded (re-ingested) before its contents can be cited.`
  },
})
79