How a question becomes a cited answer

Follow one question — "What is the cap on the firm's liability?" — from the qa agent through local embeddings to a verified [Document § 9] citation in the browser.

services/ingest/src/ingest.ts — 140 lines · ingestDocument L61–125
Outline — 9 symbols
CHUNK_CHARS const
CLAUSE_RE const
HEADING_RE const
Section type
sectionize function export
chunkSection function
ingestDocument function export
extractDocumentText function export
extractRawText function
1import { writeFileSync } from "node:fs"
2import path from "node:path"
3import mammoth from "mammoth"
4import { openDb, upsertDocument, insertChunk } from "./db"
5import { embedChunk, migrateEmbeddings } from "./embed"
6import { extractStructure, migrateStructure } from "./structure"
7import { pdfToText } from "./pdf"
8import { detectInjection, normalizeExtractedText, scanDocxHiddenContent } from "./sanitize"
9
/** Target chunk size in characters: ~500 tokens at roughly 4 chars/token. */
const CHUNK_CHARS = 2000

/** A clause line like "7.2 Termination" or "12. Governing Law"; group 1 is the clause number. */
const CLAUSE_RE = /^\s*(\d+(?:\.\d+)*)[.)]?\s+(.+)$/
/** A heading line: short, mostly uppercase, no trailing sentence punctuation. */
const HEADING_RE = /^[A-Z0-9][A-Z0-9 ,'\-&/]{2,79}$/

/**
 * A contiguous slice of a document that starts at a heading/clause line.
 * `charStart`/`charEnd` are offsets into the text passed to `sectionize`, and
 * `text` is exactly that text's `[charStart, charEnd)` range.
 */
type Section = { label: string; text: string; charStart: number; charEnd: number }

/**
 * Split a document into sections, starting a new one at every heading line.
 *
 * A line opens a new section when it is non-empty and is either a numbered
 * clause (`CLAUSE_RE`), a Markdown heading (`#`–`######` followed by
 * whitespace), or an upper-case heading (`HEADING_RE`). Text before the first
 * heading is collected under the label "Preamble". Sections whose text is
 * whitespace-only are dropped.
 *
 * @param text The full document text; lines are separated by "\n".
 * @returns Sections in document order. For each, the label is the clause number
 *   when the heading is a numbered clause, otherwise the heading text itself,
 *   and `text.slice(charStart, charEnd)` equals the section's `text`.
 */
export function sectionize(text: string): Section[] {
  const sections: Section[] = []
  let current: Section = { label: "Preamble", text: "", charStart: 0, charEnd: 0 }
  let offset = 0

  // Close the section being accumulated at `end`; whitespace-only ones are dropped.
  const flush = (end: number): void => {
    if (!current.text.trim()) return
    current.charEnd = end
    sections.push(current)
  }

  const lines = text.split("\n")
  lines.forEach((line, index) => {
    // Only lines followed by a separator actually had a "\n" in `text`. Adding
    // one to the final line would push charEnd one past the end of the document
    // and put a newline in the section that the source never contained.
    const raw = index < lines.length - 1 ? line + "\n" : line

    // markitdown emits Markdown headings; strip the hashes so "## 7.2 Termination"
    // sections the same as a plain "7.2 Termination" line.
    const stripped = line.trim()
    const trimmed = stripped.replace(/^#{1,6}\s+/, "")
    const isMdHeading = trimmed !== stripped
    const clause = trimmed.match(CLAUSE_RE)
    const isHeading =
      trimmed.length > 0 && (clause !== null || isMdHeading || HEADING_RE.test(trimmed))

    if (isHeading) {
      flush(offset)
      current = { label: clause?.[1] ?? trimmed, text: raw, charStart: offset, charEnd: 0 }
    } else {
      current.text += raw
    }
    offset += raw.length
  })

  flush(offset)
  return sections
}
50
/**
 * Cut a section's text into consecutive chunks of at most `maxChars` UTF-16
 * code units, skipping chunks that are whitespace-only.
 *
 * A cut is never placed between the two halves of a surrogate pair: when the
 * boundary would land there, the chunk is extended by one code unit so the
 * character stays whole (such a chunk is `maxChars + 1` long).
 *
 * @param section The section to split; its `charStart` is the base for chunk offsets.
 * @param maxChars Maximum chunk length in code units; defaults to `CHUNK_CHARS`.
 * @returns Chunks in order, each with its text and `[charStart, charEnd)` offsets
 *   expressed in the same coordinate space as `section.charStart`.
 * @throws RangeError if `maxChars` is not a positive integer.
 */
function chunkSection(section: Section, maxChars: number = CHUNK_CHARS) {
  // A zero or negative step would never advance and loop forever.
  if (!Number.isInteger(maxChars) || maxChars < 1) {
    throw new RangeError(`maxChars must be a positive integer, got ${maxChars}`)
  }

  const chunks: { text: string; charStart: number; charEnd: number }[] = []
  const body = section.text
  let start = 0

  while (start < body.length) {
    let end = Math.min(start + maxChars, body.length)
    if (end < body.length) {
      const before = body.charCodeAt(end - 1)
      const after = body.charCodeAt(end)
      const splitsPair =
        before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff
      // Keep the low surrogate with its high surrogate instead of emitting two
      // chunks that each contain half a character.
      if (splitsPair) end += 1
    }

    const slice = body.slice(start, end)
    if (slice.trim()) {
      chunks.push({
        text: slice,
        charStart: section.charStart + start,
        charEnd: section.charStart + end,
      })
    }
    start = end
  }
  return chunks
}
60
/**
 * Store an uploaded document in a matter directory and index it.
 *
 * Writes `buffer` to `matterDir/fileName`, extracts and normalizes its text,
 * collects injection findings, splits the text into sections and chunks,
 * embeds every chunk, and records the document, its chunks, and its extracted
 * structure in the matter database.
 *
 * @param matterDir Directory of the matter; the file is written here and the
 *   database is opened from here.
 * @param fileName Name of the uploaded file, relative to `matterDir`.
 * @param buffer Raw bytes of the uploaded file.
 * @returns A summary: file name, on-disk path, section count, chunk count, and
 *   either `null` or the injection findings with the number of flagged chunks.
 * @throws Error if `fileName` would resolve to `matterDir` itself or outside it.
 */
export async function ingestDocument(matterDir: string, fileName: string, buffer: Buffer) {
  const docPath = path.join(matterDir, fileName)

  // The file name arrives with an untrusted upload; a name such as
  // "../../x.docx" must not be able to write outside the matter directory.
  const relative = path.relative(path.resolve(matterDir), path.resolve(docPath))
  const escapesMatterDir =
    relative === "" ||
    relative === ".." ||
    relative.startsWith(".." + path.sep) ||
    path.isAbsolute(relative)
  if (escapesMatterDir) {
    throw new Error(`Refusing to write "${fileName}" outside the matter directory`)
  }
  writeFileSync(docPath, buffer)

  // Untrusted-document defense (issue #17): normalize away invisible Unicode, then
  // scan the visible text and the raw DOCX for injected instructions. Findings are
  // stored on the document and flag overlapping chunks — the text itself is never
  // rewritten, so the lawyer always reviews exactly what the counterparty wrote.
  const extraction = normalizeExtractedText(await extractRawText(fileName, buffer))
  const text = extraction.text
  const findings = [
    ...extraction.findings,
    ...detectInjection(text),
    ...(fileName.toLowerCase().endsWith(".docx") ? scanDocxHiddenContent(buffer) : []),
  ]
  const sections = sectionize(text)

  let chunkIndex = 0
  let flaggedChunks = 0

  const db = openDb(matterDir)
  try {
    // Never mix vectors from two embedding models in one matter — re-embed any
    // stale chunks before this document's are written (no-op on current DBs).
    // Same for structure rows extracted by an older pattern version.
    await migrateEmbeddings(db)
    migrateStructure(db)
    const documentId = upsertDocument(
      db,
      docPath,
      fileName,
      Date.now(),
      findings.length ? JSON.stringify({ findings }) : null,
    )

    for (const section of sections) {
      for (const chunk of chunkSection(section)) {
        const embedding = await embedChunk(fileName, section.label, chunk.text)
        // A chunk is flagged when any finding that carries offsets overlaps its range.
        const flagged = findings.some(
          (f) =>
            f.charStart !== undefined &&
            f.charEnd !== undefined &&
            f.charStart < chunk.charEnd &&
            f.charEnd > chunk.charStart,
        )
        if (flagged) flaggedChunks++
        insertChunk(db, {
          documentId,
          docPath,
          docName: fileName,
          section: section.label,
          chunkIndex: chunkIndex++,
          text: chunk.text,
          charStart: chunk.charStart,
          charEnd: chunk.charEnd,
          embedding,
          flagged,
        })
      }
    }
    extractStructure(db, docPath, fileName, text)
  } finally {
    // Release the handle even when a migration, an embedding call, or an insert
    // throws; otherwise the connection stays open after the failure.
    db.close()
  }

  return {
    name: fileName,
    docPath,
    sections: sections.length,
    chunks: chunkIndex,
    injection: findings.length ? { findings, flaggedChunks } : null,
  }
}
126
/**
 * Pull plain text from a source document. DOCX goes through mammoth; PDF through
 * markitdown/unpdf with an OCR fallback for flat scans (see pdf.ts). Always
 * normalized (see sanitize.ts) so every consumer — indexing, the document/template
 * text routes, and through them the agents' read-document tool — sees the same
 * normalized text that chunk offsets were computed over.
 *
 * @param fileName Name of the document; its extension selects the extractor.
 * @param buffer Raw bytes of the document.
 * @returns The normalized text of the document.
 */
export async function extractDocumentText(fileName: string, buffer: Buffer): Promise<string> {
  const raw = await extractRawText(fileName, buffer)
  return normalizeExtractedText(raw).text
}
135
/**
 * Extract un-normalized text from a document, choosing the extractor by extension.
 *
 * Files whose name ends in ".pdf" (case-insensitive) go to `pdfToText`; every
 * other file is handed to mammoth and the `value` of its result is returned.
 *
 * NOTE(review): any non-PDF input is treated as DOCX here — presumably callers
 * only pass .pdf/.docx files; confirm before accepting other formats.
 *
 * @param fileName Name of the document; only its extension is inspected.
 * @param buffer Raw bytes of the document.
 */
async function extractRawText(fileName: string, buffer: Buffer) {
  const isPdf = fileName.toLowerCase().endsWith(".pdf")
  if (isPdf) return pdfToText(buffer)

  const result = await mammoth.extractRawText({ buffer })
  return result.value
}
140

No results