// embeddings-transformers-noc.js
//
// Provenance: reconstructed from a two-commit patch series by Daniel Shiffman
//   1. "getting embeddings from markdown NOC files" (2024-08-31)
//   2. "fix filename" (2024-09-01)
//
// Reads every markdown file in transcripts/markdown, splits each file into
// chunks at its headings, computes one embedding per chunk, and writes:
//   - embeddings/<name>.json  : the chunks + embeddings of a single file
//   - embeddings.json         : the chunks + embeddings of all files combined

import * as fs from 'fs';
import * as path from 'path';
import { pipeline } from '@xenova/transformers';

const MARKDOWN_DIR = 'transcripts/markdown';
const EMBEDDINGS_DIR = 'embeddings';
const COMBINED_FILE = 'embeddings.json';

// ATX heading: 1-6 '#' characters followed by whitespace or end of line.
// A bare startsWith('#') would also match things like "#hashtag".
const HEADING_PATTERN = /^#{1,6}(?:\s|$)/;

// Opening/closing marker of a fenced code block (``` or ~~~).
const FENCE_PATTERN = /^(`{3,}|~{3,})/;

// Load the embeddings model
const extractor = await pipeline('feature-extraction', 'Xenova/bge-small-en-v1.5');

/**
 * Split markdown text into chunks, starting a new chunk at every heading.
 *
 * Each line is trimmed before being added to its chunk, and each chunk is
 * trimmed as a whole. Chunks that end up empty (for example blank lines
 * before the first heading, or a whitespace-only file) are dropped so that
 * no empty string is ever sent to the embeddings model. Lines that start
 * with '#' inside a fenced code block are not treated as headings.
 *
 * @param {string} text - Raw markdown contents of one file.
 * @returns {string[]} Non-empty chunks in document order.
 */
function calculateMarkdownChunks(text) {
  const chunks = [];
  let currentLines = [];
  // Marker character ('`' or '~') of the fence we are inside, or null.
  // An unclosed fence keeps the remainder of the file in a single chunk.
  let openFence = null;

  const flush = () => {
    const chunk = currentLines.join('\n').trim();
    if (chunk !== '') {
      chunks.push(chunk);
    }
    currentLines = [];
  };

  for (const rawLine of text.split('\n')) {
    // trim() also removes the '\r' left behind by Windows line endings.
    const line = rawLine.trim();
    const fenceMatch = FENCE_PATTERN.exec(line);

    if (fenceMatch) {
      const marker = fenceMatch[1][0];
      if (openFence === null) {
        openFence = marker;
      } else if (openFence === marker) {
        openFence = null;
      }
    } else if (openFence === null && HEADING_PATTERN.test(line)) {
      // A heading closes the chunk accumulated so far and opens a new one.
      flush();
    }

    currentLines.push(line);
  }

  // Push the last chunk if any
  flush();

  return chunks;
}

/**
 * Embed every markdown file in MARKDOWN_DIR and write the results to disk.
 *
 * @returns {Promise<void>} Resolves once all output files are written.
 */
async function main() {
  // Scan the transcripts directory for markdown files only; anything else
  // (e.g. OS metadata files) is skipped.
  const files = fs
    .readdirSync(MARKDOWN_DIR)
    .filter((name) => path.extname(name).toLowerCase() === '.md');

  // writeFileSync does not create missing directories.
  fs.mkdirSync(EMBEDDINGS_DIR, { recursive: true });

  const fullOutput = [];

  // Iterate through each file and calculate the embeddings
  for (const [index, file] of files.entries()) {
    const text = fs.readFileSync(`${MARKDOWN_DIR}/${file}`, 'utf-8');

    // Calculate chunks based on this text
    const chunks = calculateMarkdownChunks(text);

    // Extract embeddings for each chunk
    const output = [];

    for (const chunk of chunks) {
      const embeddingOutput = await extractor(chunk, {
        pooling: 'mean',
        normalize: true,
      });

      // NOTE(review): presumably tolist() yields one row per input string,
      // so [0] is the embedding of this chunk - confirm against the library.
      const embedding = embeddingOutput.tolist()[0];
      const record = { text: chunk, embedding };
      output.push(record);
      fullOutput.push(record);
    }

    // Save the embeddings to a file. path.parse strips only the real
    // extension, unlike replace('.md', ...) which rewrites the first
    // ".md" substring found anywhere in the name.
    const fileOut = `${EMBEDDINGS_DIR}/${path.parse(file).name}.json`;
    fs.writeFileSync(fileOut, JSON.stringify(output));

    console.log(
      `Embeddings saved for ${file} to ${fileOut} (${output.length} chunks) (${
        index + 1
      }/${files.length})`
    );
  }

  // Save the full output to a single file
  fs.writeFileSync(COMBINED_FILE, JSON.stringify(fullOutput));
  console.log(`Complete embeddings saved to ${COMBINED_FILE}`);
}

await main();