/**
 * Embedding pipeline — text (BGE-M3, 1024-dim) + images (CLIP ViT-L/14, 768-dim).
 * Uses @huggingface/transformers (ONNX Runtime) for local inference.
 *
 * NOTE: This is a stub implementation. The actual @huggingface/transformers
 * integration will be wired once we confirm ONNX model availability.
 * For now, it provides the CLI structure and DB queries.
 */
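
// Assumed schema coupling (not confirmed by this file): text_embeddings.embedding
// is a pgvector vector(1024) and image_embeddings.embedding a vector(768), so the
// model choices above and the column types have to change together.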

import { program } from "commander";
import { query, closePool } from "./db.js";

/**
 * A row to embed: (ref_type, ref_id) is a polymorphic reference into content,
 * entities, or annotations. ref_id is text, so integer annotation ids are cast
 * with ::text in the query below.
 */
interface EmbeddableRow {
  ref_type: string;
  ref_id: string;
  text: string;
}

async function getUnembeddedTexts(): Promise<EmbeddableRow[]> {
  const result = await query<EmbeddableRow>(`
    SELECT 'content' AS ref_type, id AS ref_id,
           COALESCE(text, '') AS text
    FROM content
    WHERE text IS NOT NULL AND length(text) > 0
      AND NOT EXISTS (
        SELECT 1 FROM text_embeddings te
        WHERE te.ref_type = 'content' AND te.ref_id = content.id
      )
    UNION ALL
    SELECT 'entity', id,
           trim(COALESCE(name, '') || ' ' || COALESCE(notes, ''))
    FROM entities
    WHERE length(trim(COALESCE(name, '') || ' ' || COALESCE(notes, ''))) > 0
      AND NOT EXISTS (
        SELECT 1 FROM text_embeddings te
        WHERE te.ref_type = 'entity' AND te.ref_id = entities.id
      )
    UNION ALL
    SELECT 'annotation', id::text, note
    FROM annotations
    WHERE note IS NOT NULL AND length(note) > 0
      AND NOT EXISTS (
        SELECT 1 FROM text_embeddings te
        WHERE te.ref_type = 'annotation' AND te.ref_id = annotations.id::text
      )
  `);
  return result.rows;
}
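
/**
 * Assuming the embedding columns are pgvector: pgvector accepts the textual
 * form "[v1,v2,...]", and JSON.stringify of a number[] produces exactly that.
 * Hypothetical helper for the commented pipeline sketches below; exported so
 * it is ready once they are wired up.
 */
export function toVectorLiteral(vec: number[]): string {
  return JSON.stringify(vec);
}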

/** An image-typed media row with no image_embeddings entry yet. */
interface MediaRow {
  id: string;
  file_path: string;
  entity_id: string | null;
}

async function getUnembeddedImages(): Promise<MediaRow[]> {
  const result = await query<MediaRow>(`
    SELECT m.id, m.file_path, m.entity_id
    FROM media m
    WHERE m.type IN ('image', 'screenshot', 'thumbnail')
      AND NOT EXISTS (
        SELECT 1 FROM image_embeddings ie WHERE ie.media_id = m.id
      )
  `);
  return result.rows;
}

async function embedTexts(): Promise<void> {
  const rows = await getUnembeddedTexts();
  if (rows.length === 0) {
    console.log("Text embeddings: nothing new to embed.");
    return;
  }

  console.log(`Text embeddings: ${rows.length} items to embed.`);

  // TODO: Wire the @huggingface/transformers ONNX pipeline once model
  // availability is confirmed. A sketch, assuming ONNX weights for BAAI/bge-m3
  // are on the Hub and that chunk_idx/model have column defaults matching the
  // ON CONFLICT target. (Add at top: import { pipeline } from "@huggingface/transformers";)
  //
  // // v3 takes dtype (e.g. "q8") rather than v2's `quantized: true` flag.
  // const pipe = await pipeline("feature-extraction", "BAAI/bge-m3", { dtype: "q8" });
  // const BATCH = 32;
  // for (let i = 0; i < rows.length; i += BATCH) {
  //   const batch = rows.slice(i, i + BATCH);
  //   const texts = batch.map((r) => r.text.slice(0, 2000));
  //   // CLS pooling + L2 normalization matches BGE-M3 dense retrieval usage.
  //   const embeddings = await pipe(texts, { pooling: "cls", normalize: true });
  //   // One Tensor of shape [batch, 1024] comes back; its rows are not
  //   // indexable with [], so convert to number[][] first.
  //   const vecs = embeddings.tolist() as number[][];
  //   for (let j = 0; j < batch.length; j++) {
  //     await query(
  //       `INSERT INTO text_embeddings (ref_type, ref_id, text, embedding)
  //        VALUES ($1, $2, $3, $4)
  //        ON CONFLICT (ref_type, ref_id, chunk_idx, model) DO NOTHING`,
  //       [batch[j].ref_type, batch[j].ref_id, texts[j], toVectorLiteral(vecs[j])],
  //     );
  //   }
  //   console.log(`  ${Math.min(i + BATCH, rows.length)}/${rows.length}`);
  // }

  console.log(
    "  [stub] @huggingface/transformers ONNX integration pending.\n" +
    "  Add @huggingface/transformers and onnxruntime-node to package.json/package-lock.json, rebuild the devenv,\n" +
    "  then uncomment the pipeline sketch in src/embed.ts.",
  );
}
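
/**
 * The ON CONFLICT target above includes chunk_idx, which suggests long texts
 * are meant to be split into several embedded chunks rather than truncated at
 * 2000 chars. A minimal sketch, assuming fixed-size character windows with
 * overlap (sizes are placeholders, not tuned); wire it into the batch loop
 * when the pipeline is uncommented.
 */
export function chunkText(text: string, size = 2000, overlap = 200): string[] {
  const chunks: string[] = [];
  for (let start = 0; start < text.length; start += size - overlap) {
    chunks.push(text.slice(start, start + size));
    if (start + size >= text.length) break; // final window reached the end
  }
  return chunks.length > 0 ? chunks : [text];
}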

async function embedImages(): Promise<void> {
  const rows = await getUnembeddedImages();
  if (rows.length === 0) {
    console.log("Image embeddings: nothing new to embed.");
    return;
  }

  console.log(`Image embeddings: ${rows.length} items to embed.`);

  // TODO: Wire the CLIP vision tower. The generic 'image-feature-extraction'
  // pipeline returns vision-tower hidden states (1024-dim for ViT-L/14), not
  // the 768-dim joint-space embedding, so this sketch uses
  // CLIPVisionModelWithProjection instead. Assumes ONNX weights are available
  // and that file_path is readable by this process. (Add at top:
  // import { AutoProcessor, CLIPVisionModelWithProjection, RawImage } from "@huggingface/transformers";)
  //
  // const processor = await AutoProcessor.from_pretrained("openai/clip-vit-large-patch14");
  // const vision = await CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14");
  // for (const row of rows) {
  //   try {
  //     const image = await RawImage.read(row.file_path);
  //     const inputs = await processor(image);
  //     const { image_embeds } = await vision(inputs);
  //     // L2-normalize so cosine similarity reduces to a dot product.
  //     const vec = image_embeds.normalize().tolist()[0] as number[];
  //     await query(
  //       `INSERT INTO image_embeddings (media_id, embedding)
  //        VALUES ($1, $2)
  //        ON CONFLICT (media_id, model) DO NOTHING`,
  //       [row.id, toVectorLiteral(vec)],
  //     );
  //   } catch (err) {
  //     console.warn(`  CLIP error on ${row.file_path}: ${err}`);
  //   }
  // }

  console.log(
    "  [stub] CLIP ONNX integration pending.\n" +
    "  Add @huggingface/transformers and onnxruntime-node to package.json/package-lock.json, rebuild the devenv,\n" +
    "  then uncomment the pipeline sketch above.",
  );
}

async function main(): Promise<void> {
  program
    .name("db-embed")
    .description("Embedding pipeline — text (BGE-M3) + images (CLIP)")
    .option("--text-only", "Only embed text")
    .option("--images-only", "Only embed images")
    .parse();

  const opts = program.opts<{ textOnly?: boolean; imagesOnly?: boolean }>();

  try {
    if (!opts.imagesOnly) {
      console.log("=== Text embedding pass ===");
      await embedTexts();
    }
    if (!opts.textOnly) {
      console.log("=== Image embedding pass ===");
      await embedImages();
    }
    console.log("=== Embedding pipeline complete ===");
  } finally {
    await closePool();
  }
}

main().catch((err) => {
  console.error("Fatal:", err);
  process.exit(1);
});
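
// Example invocations (assuming the bin name registered in package.json is "db-embed"):
//   db-embed                # embed new texts, then new images
//   db-embed --text-only    # skip the image pass
//   db-embed --images-only  # skip the text pass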
