How a question becomes a cited answer
Follow one question — "What is the cap on the firm's liability?" — from the qa agent through local embeddings to a verified [Document § 9] citation in the browser.
services/ingest/src/ingest.ts · 98 lines · ingestDocument L55–85
Outline 8 symbols
- CHUNK_CHARS const
- CLAUSE_RE const
- HEADING_RE const
- Section type
- sectionize function
- chunkSection function
- ingestDocument function export
- extractDocumentText function
1import { writeFileSync } from "node:fs"
2import path from "node:path"
3import mammoth from "mammoth"
4import { openDb, upsertDocument, insertChunk } from "./db"
5import { embed } from "./embed"
6
// Maximum chunk length in characters: ~500 tokens at roughly 4 chars/token.
const CHUNK_CHARS = 2000

// A clause line like "7.2 Termination" or "12. Governing Law": optional leading
// whitespace, a dotted number (capture 1), an optional "." or ")", whitespace,
// then the rest of the line (capture 2). Note that any line starting with a
// number followed by text matches, e.g. "30 days after notice".
const CLAUSE_RE = /^\s*(\d+(?:\.\d+)*)[.)]?\s+(.+)$/
// A heading line: 3–80 characters drawn only from uppercase ASCII letters,
// digits, spaces and , ' - & / — so no lowercase and no "." ":" ";" "?" "!".
const HEADING_RE = /^[A-Z0-9][A-Z0-9 ,'\-&/]{2,79}$/

// One labelled span of a document: the clause number or heading text it starts
// with, the section's text, and its start/end character offsets.
type Section = { label: string; text: string; charStart: number; charEnd: number }
16
/**
 * Split extracted document text into labelled sections.
 *
 * A new section starts at every line that matches CLAUSE_RE (labelled with the
 * clause number, e.g. "7.2") or HEADING_RE (labelled with the trimmed heading).
 * Text before the first such line is labelled "Preamble". Sections whose text
 * is empty or whitespace-only are dropped.
 *
 * For every returned section, `text.slice(charStart, charEnd) === section.text`.
 *
 * @param text - Full plain text of the document.
 * @returns Sections in document order.
 */
function sectionize(text: string): Section[] {
  const sections: Section[] = []
  let label = "Preamble"
  let sectionStart = 0
  let offset = 0

  // Emit the section that runs from sectionStart up to (not including) `end`.
  const closeSection = (end: number): void => {
    const body = text.slice(sectionStart, end)
    if (body.trim()) {
      sections.push({ label, text: body, charStart: sectionStart, charEnd: end })
    }
  }

  for (const line of text.split("\n")) {
    const trimmed = line.trim()
    const clause = CLAUSE_RE.exec(trimmed)
    const startsSection = clause !== null || (trimmed.length > 0 && HEADING_RE.test(trimmed))
    if (startsSection) {
      closeSection(offset)
      label = clause?.[1] ?? trimmed
      sectionStart = offset
    }
    // +1 accounts for the "\n" that split() removed.
    offset += line.length + 1
  }

  // split() yields one more piece than there are newlines, so the running
  // offset ends one past the end of the text; clamp so the last section's
  // charEnd stays inside the document.
  closeSection(Math.min(offset, text.length))
  return sections
}
44
/**
 * Cut a section's text into consecutive chunks of at most `chunkChars` UTF-16
 * code units, skipping chunks that are empty or whitespace-only.
 *
 * A chunk is extended by one code unit when the cut would otherwise fall
 * between the two halves of a surrogate pair, so no chunk starts or ends with
 * a lone surrogate produced by the split.
 *
 * @param section - Section to split; chunk offsets are relative to `section.charStart`.
 * @param chunkChars - Target chunk length; defaults to CHUNK_CHARS.
 * @returns Chunks in order, each with its text and start/end character offsets.
 * @throws RangeError if `chunkChars` is not a positive integer.
 */
function chunkSection(
  section: Section,
  chunkChars: number = CHUNK_CHARS,
): { text: string; charStart: number; charEnd: number }[] {
  if (!Number.isInteger(chunkChars) || chunkChars < 1) {
    throw new RangeError(`chunkChars must be a positive integer, got ${chunkChars}`)
  }

  const chunks: { text: string; charStart: number; charEnd: number }[] = []
  const source = section.text
  let start = 0

  while (start < source.length) {
    let end = Math.min(start + chunkChars, source.length)
    if (end < source.length) {
      const last = source.charCodeAt(end - 1)
      const next = source.charCodeAt(end)
      const splitsPair = last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff
      // Keep the low surrogate with its high surrogate.
      if (splitsPair) end += 1
    }

    const slice = source.slice(start, end)
    if (slice.trim()) {
      chunks.push({
        text: slice,
        charStart: section.charStart + start,
        charEnd: section.charStart + end,
      })
    }
    start = end
  }
  return chunks
}
54
/**
 * Store a document in a matter directory and index its text for retrieval.
 *
 * Writes `buffer` to `<matterDir>/<fileName>`, extracts its plain text, splits
 * the text into sections and chunks, embeds each chunk, and inserts every chunk
 * into the database opened for `matterDir`. Chunks are embedded one at a time,
 * in document order, and numbered with a running `chunkIndex`.
 *
 * The database handle is closed whether or not indexing succeeds. Errors from
 * writing, extraction, embedding or the database propagate to the caller.
 *
 * @param matterDir - Directory that receives the file; also passed to `openDb`.
 * @param fileName - Name to store the file under; its extension selects the extractor.
 * @param buffer - Raw file contents.
 * @returns The file name, the stored path, and how many sections and chunks were indexed.
 */
export async function ingestDocument(
  matterDir: string,
  fileName: string,
  buffer: Buffer,
): Promise<{ name: string; docPath: string; sections: number; chunks: number }> {
  // NOTE(review): fileName is joined into the path unchecked. If it can come
  // from an untrusted upload, confirm the caller strips directory components
  // ("../") before this point.
  const docPath = path.join(matterDir, fileName)
  writeFileSync(docPath, buffer)

  const text = await extractDocumentText(fileName, buffer)
  const sections = sectionize(text)

  const db = openDb(matterDir)
  let chunkIndex = 0
  try {
    const documentId = upsertDocument(db, docPath, fileName, Date.now())

    for (const section of sections) {
      for (const chunk of chunkSection(section)) {
        const embedding = await embed(chunk.text)
        insertChunk(db, {
          documentId,
          docPath,
          docName: fileName,
          section: section.label,
          chunkIndex: chunkIndex++,
          text: chunk.text,
          charStart: chunk.charStart,
          charEnd: chunk.charEnd,
          embedding,
        })
      }
    }
  } finally {
    // Release the handle even when embedding or an insert throws.
    db.close()
  }

  return { name: fileName, docPath, sections: sections.length, chunks: chunkIndex }
}
86
/**
 * Pull plain text from a source document for indexing.
 *
 * Files whose name ends in ".pdf" (case-insensitive) go through unpdf's pdf.js
 * build with all pages merged into one string; everything else is handed to
 * mammoth as DOCX. Both feed the same sectionize/chunk/embed path, so the rest
 * of ingestion is format-agnostic.
 *
 * @param fileName - Used only to pick the extractor by extension.
 * @param buffer - Raw file contents.
 * @returns The document's plain text.
 */
async function extractDocumentText(fileName: string, buffer: Buffer): Promise<string> {
  const isPdf = fileName.toLowerCase().endsWith(".pdf")
  if (!isPdf) {
    const { value } = await mammoth.extractRawText({ buffer })
    return value
  }

  // unpdf is loaded on demand, only when a PDF is actually ingested.
  const { extractText, getDocumentProxy } = await import("unpdf")
  const pdf = await getDocumentProxy(new Uint8Array(buffer))
  const { text } = await extractText(pdf, { mergePages: true })
  return text
}
98