[{"data":1,"prerenderedAt":126},["ShallowReactive",2],{"blog-post-en-\u002Fblog\u002Fdocument-ingestion-checklist":3,"i-mdi:book-search-outline":107,"i-mdi:scale-balance":111,"i-mdi:shield-lock-outline":113,"i-mdi:robot-outline":115,"i-mdi:linkedin":117,"i-mdi:twitter":119,"i-mdi:github":121,"i-circle-flags:lang-en":123},{"id":4,"title":5,"author":6,"body":7,"date":92,"description":93,"draft":94,"extension":95,"image":96,"meta":97,"navigation":98,"path":99,"seo":100,"stem":101,"tags":102,"__hash__":106},"blog_en\u002Fblog\u002Fdocument-ingestion-checklist.md","Document ingestion checklist before you ship RAG","Minerva Data Solutions",{"type":8,"value":9,"toc":82},"minimark",[10,14,17,22,25,28,32,35,38,42,45,48,52,55,58,62,65],[11,12,13],"p",{},"Most failed document AI projects do not fail at the chatbot layer. They fail earlier: messy permissions, duplicate documents, poor OCR, missing ownership, weak metadata, stale files, and no way to prove which version was used.",[11,15,16],{},"Before building RAG, build the ingestion discipline.",[18,19,21],"h3",{"id":20},"_1-classify-the-corpus","1. Classify the corpus",[11,23,24],{},"Start by separating document families: policies, procedures, contracts, board packs, audit evidence, technical manuals, invoices, emails, and scanned PDFs. Each family has different structure, retention rules, owners, and risk.",[11,26,27],{},"Do not treat the whole shared drive as one blob. A contract clause, a policy exception, and a scanned invoice need different parsing and review rules.",[18,29,31],{"id":30},"_2-preserve-source-truth","2. Preserve source truth",[11,33,34],{},"Keep an immutable raw copy. Store extracted text separately. Record the parser version, OCR engine, language, page count, checksum, owner, creation date, modification date, and source location.",[11,36,37],{},"If the extracted text later changes because you improve OCR or parsing, you should still know what source produced the old result.",[18,39,41],{"id":40},"_3-normalize-before-embedding","3. Normalize before embedding",[11,43,44],{},"Clean headers, footers, page numbers, boilerplate, hyphenation, tables, and repeated legal notices. Keep layout signals when they matter: headings, sections, tables, clauses, signatures, and appendices.",[11,46,47],{},"Embedding dirty text creates dirty retrieval. The model cannot compensate for a corpus that was mangled during ingestion.",[18,49,51],{"id":50},"_4-chunk-by-meaning-not-by-arbitrary-size","4. Chunk by meaning, not by arbitrary size",[11,53,54],{},"Chunking should respect document structure. Policies often work by section. Contracts work by clause. Manuals work by procedure. Tables need special handling. A 1,000-token blind split may be simple, but it can cut the exact evidence in half.",[11,56,57],{},"Every chunk should carry enough metadata to explain itself: document ID, version, page, heading path, clause number, language, access scope, and retention policy.",[18,59,61],{"id":60},"_5-validate-before-launch","5. Validate before launch",[11,63,64],{},"Run test questions from real users. Include questions with no answer, old-versus-new document conflicts, access-restricted documents, and ambiguous language. Measure retrieval precision before optimizing the LLM.",[11,66,67,68,75,76,81],{},"Further reading: ",[69,70,74],"a",{"href":71,"rel":72},"https:\u002F\u002Ftowardsdatascience.com\u002Fdocument-intelligence-a-series-on-building-rag-brick-by-brick-from-minimal-to-corpus-scale\u002F",[73],"nofollow","Enterprise document intelligence series"," and ",[69,77,80],{"href":78,"rel":79},"https:\u002F\u002Ffuzzypoint.uk\u002Fenterprise-rag-designing-retrieval-augmented-generation-with",[73],"enterprise RAG provenance patterns",".",{"title":83,"searchDepth":84,"depth":84,"links":85},"",2,[86,88,89,90,91],{"id":20,"depth":87,"text":21},3,{"id":30,"depth":87,"text":31},{"id":40,"depth":87,"text":41},{"id":50,"depth":87,"text":51},{"id":60,"depth":87,"text":61},"2026-05-22","The pre-flight checks that stop bad OCR, wrong versions, and permission leaks from poisoning your retrieval layer.",false,"md","\u002Fimg\u002Fog-image.png",{},true,"\u002Fblog\u002Fdocument-ingestion-checklist",{"title":5,"description":93},"blog\u002Fdocument-ingestion-checklist",[103,104,105],"document AI","ingestion","knowledge management","bI1d8F7qsIS5iFKYHIPDocDl9lRwWgKnnK7ogOYJth8",{"left":108,"top":108,"width":109,"height":109,"rotate":108,"vFlip":94,"hFlip":94,"body":110},0,24,"\u003Cpath fill=\"currentColor\" d=\"M15.5 12c2.5 0 4.5 2 4.5 4.5c0 .88-.25 1.71-.69 2.4l3.08 3.1L21 23.39l-3.12-3.07c-.69.43-1.51.68-2.38.68c-2.5 0-4.5-2-4.5-4.5s2-4.5 4.5-4.5m0 2a2.5 2.5 0 0 0-2.5 2.5a2.5 2.5 0 0 0 2.5 2.5a2.5 2.5 0 0 0 2.5-2.5a2.5 2.5 0 0 0-2.5-2.5M13 4v8l-2.5-2.25L8 12V4H6v16h4c.54.81 1.23 1.5 2.03 2H6a2 2 0 0 1-2-2V4a2 2 0 0 1 2-2h12a2 2 0 0 1 2 2v7.81c-.58-.55-1.25-1-2-1.31V4z\"\u002F>",{"left":108,"top":108,"width":109,"height":109,"rotate":108,"vFlip":94,"hFlip":94,"body":112},"\u003Cpath fill=\"currentColor\" d=\"M12 3c-1.27 0-2.4.8-2.82 2H3v2h1.95L2 14c-.47 2 1 3 3.5 3s4.06-1 3.5-3L6.05 7h3.12c.33.85.98 1.5 1.83 1.83V20H2v2h20v-2h-9V8.82c.85-.32 1.5-.97 1.82-1.82h3.13L15 14c-.47 2 1 3 3.5 3s4.06-1 3.5-3l-2.95-7H21V5h-6.17C14.4 3.8 13.27 3 12 3m0 2a1 1 0 0 1 1 1a1 1 0 0 1-1 1a1 1 0 0 1-1-1a1 1 0 0 1 1-1m-6.5 5.25L7 14H4zm13 0L20 14h-3z\"\u002F>",{"left":108,"top":108,"width":109,"height":109,"rotate":108,"vFlip":94,"hFlip":94,"body":114},"\u003Cpath fill=\"currentColor\" d=\"M21 11c0 5.55-3.84 10.74-9 12c-5.16-1.26-9-6.45-9-12V5l9-4l9 4zm-9 10c3.75-1 7-5.46 7-9.78V6.3l-7-3.12L5 6.3v4.92C5 15.54 8.25 20 12 21m2.8-10V9.5C14.8 8.1 13.4 7 12 7S9.2 8.1 9.2 9.5V11c-.6 0-1.2.6-1.2 1.2v3.5c0 .7.6 1.3 1.2 1.3h5.5c.7 0 1.3-.6 1.3-1.2v-3.5c0-.7-.6-1.3-1.2-1.3m-1.3 0h-3V9.5c0-.8.7-1.3 1.5-1.3s1.5.5 1.5 1.3z\"\u002F>",{"left":108,"top":108,"width":109,"height":109,"rotate":108,"vFlip":94,"hFlip":94,"body":116},"\u003Cpath fill=\"currentColor\" d=\"M17.5 15.5c0 1.11-.89 2-2 2s-2-.89-2-2s.9-2 2-2s2 .9 2 2m-9-2c-1.1 0-2 .9-2 2s.9 2 2 2s2-.89 2-2s-.89-2-2-2M23 15v3c0 .55-.45 1-1 1h-1v1c0 1.11-.89 2-2 2H5a2 2 0 0 1-2-2v-1H2c-.55 0-1-.45-1-1v-3c0-.55.45-1 1-1h1c0-3.87 3.13-7 7-7h1V5.73c-.6-.34-1-.99-1-1.73c0-1.1.9-2 2-2s2 .9 2 2c0 .74-.4 1.39-1 1.73V7h1c3.87 0 7 3.13 7 7h1c.55 0 1 .45 1 1m-2 1h-2v-2c0-2.76-2.24-5-5-5h-4c-2.76 0-5 2.24-5 5v2H3v1h2v3h14v-3h2z\"\u002F>",{"left":108,"top":108,"width":109,"height":109,"rotate":108,"vFlip":94,"hFlip":94,"body":118},"\u003Cpath fill=\"currentColor\" d=\"M19 3a2 2 0 0 1 2 2v14a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V5a2 2 0 0 1 2-2zm-.5 15.5v-5.3a3.26 3.26 0 0 0-3.26-3.26c-.85 0-1.84.52-2.32 1.3v-1.11h-2.79v8.37h2.79v-4.93c0-.77.62-1.4 1.39-1.4a1.4 1.4 0 0 1 1.4 1.4v4.93zM6.88 8.56a1.68 1.68 0 0 0 1.68-1.68c0-.93-.75-1.69-1.68-1.69a1.69 1.69 0 0 0-1.69 1.69c0 .93.76 1.68 1.69 1.68m1.39 9.94v-8.37H5.5v8.37z\"\u002F>",{"left":108,"top":108,"width":109,"height":109,"rotate":108,"vFlip":94,"hFlip":94,"body":120},"\u003Cpath fill=\"currentColor\" d=\"M22.46 6c-.77.35-1.6.58-2.46.69c.88-.53 1.56-1.37 1.88-2.38c-.83.5-1.75.85-2.72 1.05C18.37 4.5 17.26 4 16 4c-2.35 0-4.27 1.92-4.27 4.29c0 .34.04.67.11.98C8.28 9.09 5.11 7.38 3 4.79c-.37.63-.58 1.37-.58 2.15c0 1.49.75 2.81 1.91 3.56c-.71 0-1.37-.2-1.95-.5v.03c0 2.08 1.48 3.82 3.44 4.21a4.2 4.2 0 0 1-1.93.07a4.28 4.28 0 0 0 4 2.98a8.52 8.52 0 0 1-5.33 1.84q-.51 0-1.02-.06C3.44 20.29 5.7 21 8.12 21C16 21 20.33 14.46 20.33 8.79c0-.19 0-.37-.01-.56c.84-.6 1.56-1.36 2.14-2.23\"\u002F>",{"left":108,"top":108,"width":109,"height":109,"rotate":108,"vFlip":94,"hFlip":94,"body":122},"\u003Cpath fill=\"currentColor\" d=\"M12 2A10 10 0 0 0 2 12c0 4.42 2.87 8.17 6.84 9.5c.5.08.66-.23.66-.5v-1.69c-2.77.6-3.36-1.34-3.36-1.34c-.46-1.16-1.11-1.47-1.11-1.47c-.91-.62.07-.6.07-.6c1 .07 1.53 1.03 1.53 1.03c.87 1.52 2.34 1.07 2.91.83c.09-.65.35-1.09.63-1.34c-2.22-.25-4.55-1.11-4.55-4.92c0-1.11.38-2 1.03-2.71c-.1-.25-.45-1.29.1-2.64c0 0 .84-.27 2.75 1.02c.79-.22 1.65-.33 2.5-.33s1.71.11 2.5.33c1.91-1.29 2.75-1.02 2.75-1.02c.55 1.35.2 2.39.1 2.64c.65.71 1.03 1.6 1.03 2.71c0 3.82-2.34 4.66-4.57 4.91c.36.31.69.92.69 1.85V21c0 .27.16.59.67.5C19.14 20.16 22 16.42 22 12A10 10 0 0 0 12 2\"\u002F>",{"left":108,"top":108,"width":124,"height":124,"rotate":108,"vFlip":94,"hFlip":94,"body":125},512,"\u003Cmask id=\"SVGuywqVbel\">\u003Ccircle cx=\"256\" cy=\"256\" r=\"256\" fill=\"#fff\"\u002F>\u003C\u002Fmask>\u003Cg mask=\"url(#SVGuywqVbel)\">\u003Cpath fill=\"#eee\" d=\"m0 0l8 22l-8 23v23l32 54l-32 54v32l32 48l-32 48v32l32 54l-32 54v68l22-8l23 8h23l54-32l54 32h32l48-32l48 32h32l54-32l54 32h68l-8-22l8-23v-23l-32-54l32-54v-32l-32-48l32-48v-32l-32-54l32-54V0l-22 8l-23-8h-23l-54 32l-54-32h-32l-48 32l-48-32h-32l-54 32L68 0z\"\u002F>\u003Cpath fill=\"#0052b4\" d=\"M336 0v108L444 0Zm176 68L404 176h108zM0 176h108L0 68ZM68 0l108 108V0Zm108 512V404L68 512ZM0 444l108-108H0Zm512-108H404l108 108Zm-68 176L336 404v108z\"\u002F>\u003Cpath fill=\"#d80027\" d=\"M0 0v45l131 131h45zm208 0v208H0v96h208v208h96V304h208v-96H304V0zm259 0L336 131v45L512 0zM176 336L0 512h45l131-131zm160 0l176 176v-45L381 336z\"\u002F>\u003C\u002Fg>",1781544428026]