From deb9a494a0a688a64f08f42e87e12ea2ca14ffa7 Mon Sep 17 00:00:00 2001 From: Daniel Arroyo Date: Mon, 17 Mar 2025 15:49:07 +0100 Subject: [PATCH 1/6] feat(text_splitters): Add SemanticChunker splitter --- .../text_splitters/sematic_chunker.ts | 306 ++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100644 libs/langchain-community/src/experimental/text_splitters/sematic_chunker.ts diff --git a/libs/langchain-community/src/experimental/text_splitters/sematic_chunker.ts b/libs/langchain-community/src/experimental/text_splitters/sematic_chunker.ts new file mode 100644 index 000000000000..d585a69bc372 --- /dev/null +++ b/libs/langchain-community/src/experimental/text_splitters/sematic_chunker.ts @@ -0,0 +1,306 @@ +import { BaseDocumentTransformer, Document } from "@langchain/core/documents" +import { Embeddings } from "@langchain/core/embeddings" + +type Sentence = { + sentence: string + combined_sentence_embedding?: number[] + combined_sentence?: string + distance_to_next?: number +} + +type SentenceWithCombinedSentence = Sentence & { combined_sentence: string } +type SentenceWithEmbedding = Sentence & { combined_sentence_embedding: number[]; distance_to_next: number } + +function combineSentences(sentences: Sentence[], sentecesToCombine: number = 1) { + /** + * Combine sentences based on buffer size. + * + * @param sentences - List of sentences to combine. + * @param sentecesToCombine - Number of sentences to combine. Defaults to 1. + * @returns List of sentences with combined sentences. 
+ */ + + for (let i = 0; i < sentences.length; i++) { + let combinedSentence = "" + + // Add sentences before the current one + for (let j = i - sentecesToCombine; j < i; j++) { + if (j >= 0) { + combinedSentence += sentences[j].sentence + " " + } + } + + // Add the current sentence + combinedSentence += sentences[i].sentence + + // Add sentences after the current one + for (let j = i + 1; j < i + 1 + sentecesToCombine; j++) { + if (j < sentences.length) { + combinedSentence += " " + sentences[j].sentence + } + } + + // Store the combined sentence in the current sentence object + sentences[i].combined_sentence = combinedSentence + } + + return sentences as SentenceWithCombinedSentence[] +} + +function calculateCosineDistances(sentences: SentenceWithEmbedding[]): [number[], SentenceWithEmbedding[]] { + /** + * Calculate cosine distances between sentences. + * + * @param sentences - List of sentences to calculate distances for. + * @returns Tuple of distances and sentences. + */ + + const distances: number[] = [] + + for (let i = 0; i < sentences.length - 1; i++) { + const embeddingCurrent = sentences[i].combined_sentence_embedding + const embeddingNext = sentences[i + 1].combined_sentence_embedding + + // Calculate cosine similarity + const similarity = cosineSimilarity(embeddingCurrent, embeddingNext) + + // Convert to cosine distance + const distance = 1 - similarity + + // Append cosine distance to the list + distances.push(distance) + + // Store distance in the dictionary + sentences[i].distance_to_next = distance + } + + return [distances, sentences] +} + +// Utility function for cosine similarity +function cosineSimilarity(vecA: number[], vecB: number[]) { + const dotProduct = vecA.reduce((sum, a, i) => sum + a * vecB[i], 0) + const magnitudeA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0)) + const magnitudeB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0)) + + return magnitudeA && magnitudeB ? 
dotProduct / (magnitudeA * magnitudeB) : 0 +} + +enum BreakpointThresholdType { + PERCENTILE = "percentile", + STANDARD_DEVIATION = "standard_deviation", + INTERQUARTILE = "interquartile", + GRADIENT = "gradient", +} + +const BREAKPOINT_DEFAULTS: Record = { + percentile: 95, + standard_deviation: 3, + interquartile: 1.5, + gradient: 95, +} + +interface SemanticChunkerOptions { + sentecesToCombine?: number + sentenceSplitRegex?: RegExp + addStartIndex?: boolean + breakpointThresholdType?: BreakpointThresholdType + breakpointThresholdAmount?: number + numberOfChunks?: number + minChunkSize?: number +} + +export class SemanticChunker extends BaseDocumentTransformer { + private sentecesToCombine: number = 1 + private sentenceSplitRegex: RegExp = new RegExp(`(?<=[.?!])\\s+`) + private addStartIndex: boolean = false + private breakpointThresholdType: BreakpointThresholdType = BreakpointThresholdType.PERCENTILE + private breakpointThresholdAmount: number = BREAKPOINT_DEFAULTS[this.breakpointThresholdType] + private numberOfChunks?: number + private minChunkSize?: number + + constructor( + private embeddings: Embeddings, + options?: SemanticChunkerOptions + ) { + super() + + if (options) { + if (options.sentecesToCombine !== undefined) this.sentecesToCombine = options.sentecesToCombine + if (options.sentenceSplitRegex !== undefined) this.sentenceSplitRegex = new RegExp(options.sentenceSplitRegex) + if (options.addStartIndex !== undefined) this.addStartIndex = options.addStartIndex + if (options.breakpointThresholdType !== undefined) { + this.breakpointThresholdType = options.breakpointThresholdType + this.breakpointThresholdAmount = BREAKPOINT_DEFAULTS[options.breakpointThresholdType] + } + if (options.breakpointThresholdAmount !== undefined) this.breakpointThresholdAmount = options.breakpointThresholdAmount + if (options.numberOfChunks !== undefined) this.numberOfChunks = options.numberOfChunks + if (options.minChunkSize !== undefined) this.minChunkSize = 
options.minChunkSize + } + } + + // Utility functions + private mean(arr: number[]): number { + return arr.reduce((a, b) => a + b, 0) / arr.length + } + + private std(arr: number[]): number { + const mean = this.mean(arr) + return Math.sqrt(arr.map((x) => (x - mean) ** 2).reduce((a, b) => a + b, 0) / arr.length) + } + + private percentile(arr: number[], p: number): number { + const sorted = [...arr].sort((a, b) => a - b) + const index = Math.ceil((p / 100) * sorted.length) - 1 + return sorted[Math.max(index, 0)] + } + + private percentileValues(arr: number[], percentiles: number[]): number[] { + return percentiles.map((p) => this.percentile(arr, p)) + } + + private gradient(arr: number[]): number[] { + return arr.map((_, i, a) => (i === 0 ? a[i + 1] - a[i] : a[i] - a[i - 1])).slice(1) + } + + private _calculateBreakpointThreshold(distances: number[]): [number, number[]] { + switch (this.breakpointThresholdType) { + case BreakpointThresholdType.PERCENTILE: + return [this.percentile(distances, this.breakpointThresholdAmount), distances] + + case BreakpointThresholdType.STANDARD_DEVIATION: + return [this.mean(distances) + this.breakpointThresholdAmount * this.std(distances), distances] + + case BreakpointThresholdType.INTERQUARTILE: + const [q1, q3] = this.percentileValues(distances, [25, 75]) + const iqr = q3 - q1 + return [this.mean(distances) + this.breakpointThresholdAmount * iqr, distances] + + case BreakpointThresholdType.GRADIENT: + const distanceGradient = this.gradient(distances) + return [this.percentile(distanceGradient, this.breakpointThresholdAmount), distanceGradient] + + default: + throw new Error(`Unexpected breakpointThresholdType: ${this.breakpointThresholdType}`) + } + } + + private _thresholdFromClusters(distances: number[]): number { + if (this.numberOfChunks === undefined) { + throw new Error("This should never be called if `numberOfChunks` is undefined.") + } + + const x1 = distances.length + const y1 = 0.0 + const x2 = 1.0 + const y2 = 100.0 + 
+ const x = Math.max(Math.min(this.numberOfChunks, x1), x2) + const y = x2 === x1 ? y2 : y1 + ((y2 - y1) / (x2 - x1)) * (x - x1) + + return this.percentile(distances, Math.min(Math.max(y, 0), 100)) + } + + private async _calculateSentenceDistances(singleSentencesList: string[]): Promise<[number[], any[]]> { + const sentences = combineSentences( + singleSentencesList.map((sentence) => ({ sentence })), + this.sentecesToCombine + ) + + const embeddings = await this.embeddings.embedDocuments(sentences.map((x) => x.combined_sentence)) + + sentences.forEach((sentence, i) => { + sentence.combined_sentence_embedding = embeddings[i] + }) + + return calculateCosineDistances(sentences as SentenceWithEmbedding[]) + } + + private _getSingleSentencesList(text: string): string[] { + return text.split(this.sentenceSplitRegex) + } + + async splitText(text: string): Promise { + const singleSentencesList = this._getSingleSentencesList(text) + + if (singleSentencesList.length <= 1) return singleSentencesList + + if (this.breakpointThresholdType === BreakpointThresholdType.GRADIENT && singleSentencesList.length === 2) { + return singleSentencesList + } + + const [distances, sentences] = await this._calculateSentenceDistances(singleSentencesList) + let breakpointDistanceThreshold: number + let breakpointArray: number[] + + if (this.numberOfChunks !== undefined) { + breakpointDistanceThreshold = this._thresholdFromClusters(distances) + breakpointArray = distances + } else { + ;[breakpointDistanceThreshold, breakpointArray] = this._calculateBreakpointThreshold(distances) + } + + const indicesAboveThresh = breakpointArray.map((x, i) => (x > breakpointDistanceThreshold ? 
i : -1)).filter((i) => i !== -1) + + const chunks: string[] = [] + let startIndex = 0 + + for (const index of indicesAboveThresh) { + const endIndex = index + const group = sentences.slice(startIndex, endIndex + 1) + const combinedText = group.map((d) => d.sentence).join(" ") + + if (this.minChunkSize !== undefined && combinedText.length < this.minChunkSize) continue + chunks.push(combinedText) + startIndex = index + 1 + } + + if (startIndex < sentences.length) { + chunks.push( + sentences + .slice(startIndex) + .map((d) => d.sentence) + .join(" ") + ) + } + + return chunks + } + + async createDocuments(texts: string[], metadatas?: Record[]): Promise { + const _metadatas = metadatas || Array(texts.length).fill({}) + const documents: Document[] = [] + + for (const [i, text] of texts.entries()) { + let startIndex = 0 + const chunks = await this.splitText(text) + + chunks.forEach((chunk) => { + const metadata = { ..._metadatas[i] } + + if (this.addStartIndex) metadata.start_index = startIndex + + documents.push(new Document({ pageContent: chunk, metadata })) + startIndex += chunk.length + }) + } + + return documents + } + + async splitDocuments(documents: Document[]): Promise { + const texts: string[] = [] + const metadatas: Record[] = [] + + for (const doc of documents) { + texts.push(doc.pageContent) + metadatas.push(doc.metadata) + } + + return this.createDocuments(texts, metadatas) + } + + async transformDocuments(documents: Document[]): Promise { + return this.splitDocuments(documents) + } +} From 6550f668cbdf2785ad9f0b05d4957fe22c35eba5 Mon Sep 17 00:00:00 2001 From: Daniel Arroyo Date: Mon, 17 Mar 2025 15:52:42 +0100 Subject: [PATCH 2/6] fix(experimental): type in file name --- .../text_splitters/{sematic_chunker.ts => semantic_chunker.ts} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename libs/langchain-community/src/experimental/text_splitters/{sematic_chunker.ts => semantic_chunker.ts} (100%) diff --git 
a/libs/langchain-community/src/experimental/text_splitters/sematic_chunker.ts b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts similarity index 100% rename from libs/langchain-community/src/experimental/text_splitters/sematic_chunker.ts rename to libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts From 0e941769550a2e28aa27584c44d955843896ffa2 Mon Sep 17 00:00:00 2001 From: Daniel Arroyo Date: Mon, 31 Mar 2025 19:37:28 +0200 Subject: [PATCH 3/6] fix: Use existing cosineSimilarity function --- .../experimental/text_splitters/semantic_chunker.ts | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts index d585a69bc372..8df7541197c8 100644 --- a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts +++ b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts @@ -1,5 +1,6 @@ import { BaseDocumentTransformer, Document } from "@langchain/core/documents" import { Embeddings } from "@langchain/core/embeddings" +import { cosineSimilarity } from "@langchain/core/utils/math"; type Sentence = { sentence: string @@ -77,15 +78,6 @@ function calculateCosineDistances(sentences: SentenceWithEmbedding[]): [number[] return [distances, sentences] } -// Utility function for cosine similarity -function cosineSimilarity(vecA: number[], vecB: number[]) { - const dotProduct = vecA.reduce((sum, a, i) => sum + a * vecB[i], 0) - const magnitudeA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0)) - const magnitudeB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0)) - - return magnitudeA && magnitudeB ? 
dotProduct / (magnitudeA * magnitudeB) : 0 -} - enum BreakpointThresholdType { PERCENTILE = "percentile", STANDARD_DEVIATION = "standard_deviation", From a69b8fcd7e67500075a5587911f41484741f2e1c Mon Sep 17 00:00:00 2001 From: Daniel Arroyo Date: Mon, 31 Mar 2025 19:42:00 +0200 Subject: [PATCH 4/6] fix: nit nit: senteces -> sentences --- .../text_splitters/semantic_chunker.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts index 8df7541197c8..2acbe7342d9d 100644 --- a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts +++ b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts @@ -12,12 +12,12 @@ type Sentence = { type SentenceWithCombinedSentence = Sentence & { combined_sentence: string } type SentenceWithEmbedding = Sentence & { combined_sentence_embedding: number[]; distance_to_next: number } -function combineSentences(sentences: Sentence[], sentecesToCombine: number = 1) { +function combineSentences(sentences: Sentence[], sentencesToCombine: number = 1) { /** * Combine sentences based on buffer size. * * @param sentences - List of sentences to combine. - * @param sentecesToCombine - Number of sentences to combine. Defaults to 1. + * @param sentencesToCombine - Number of sentences to combine. Defaults to 1. * @returns List of sentences with combined sentences. 
*/ @@ -25,7 +25,7 @@ function combineSentences(sentences: Sentence[], sentecesToCombine: number = 1) let combinedSentence = "" // Add sentences before the current one - for (let j = i - sentecesToCombine; j < i; j++) { + for (let j = i - sentencesToCombine; j < i; j++) { if (j >= 0) { combinedSentence += sentences[j].sentence + " " } @@ -35,7 +35,7 @@ function combineSentences(sentences: Sentence[], sentecesToCombine: number = 1) combinedSentence += sentences[i].sentence // Add sentences after the current one - for (let j = i + 1; j < i + 1 + sentecesToCombine; j++) { + for (let j = i + 1; j < i + 1 + sentencesToCombine; j++) { if (j < sentences.length) { combinedSentence += " " + sentences[j].sentence } @@ -93,7 +93,7 @@ const BREAKPOINT_DEFAULTS: Record = { } interface SemanticChunkerOptions { - sentecesToCombine?: number + sentencesToCombine?: number sentenceSplitRegex?: RegExp addStartIndex?: boolean breakpointThresholdType?: BreakpointThresholdType @@ -103,7 +103,7 @@ interface SemanticChunkerOptions { } export class SemanticChunker extends BaseDocumentTransformer { - private sentecesToCombine: number = 1 + private sentencesToCombine: number = 1 private sentenceSplitRegex: RegExp = new RegExp(`(?<=[.?!])\\s+`) private addStartIndex: boolean = false private breakpointThresholdType: BreakpointThresholdType = BreakpointThresholdType.PERCENTILE @@ -118,7 +118,7 @@ export class SemanticChunker extends BaseDocumentTransformer { super() if (options) { - if (options.sentecesToCombine !== undefined) this.sentecesToCombine = options.sentecesToCombine + if (options.sentencesToCombine !== undefined) this.sentencesToCombine = options.sentencesToCombine if (options.sentenceSplitRegex !== undefined) this.sentenceSplitRegex = new RegExp(options.sentenceSplitRegex) if (options.addStartIndex !== undefined) this.addStartIndex = options.addStartIndex if (options.breakpointThresholdType !== undefined) { @@ -196,7 +196,7 @@ export class SemanticChunker extends 
BaseDocumentTransformer { private async _calculateSentenceDistances(singleSentencesList: string[]): Promise<[number[], any[]]> { const sentences = combineSentences( singleSentencesList.map((sentence) => ({ sentence })), - this.sentecesToCombine + this.sentencesToCombine ) const embeddings = await this.embeddings.embedDocuments(sentences.map((x) => x.combined_sentence)) From 1e369a3492be66a40f9a64658d040d730729ddf5 Mon Sep 17 00:00:00 2001 From: Daniel Arroyo Date: Mon, 31 Mar 2025 19:49:14 +0200 Subject: [PATCH 5/6] fix: execute yarn format --- .../text_splitters/semantic_chunker.ts | 301 +++++++++++------- 1 file changed, 180 insertions(+), 121 deletions(-) diff --git a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts index 2acbe7342d9d..46322eb5d5a9 100644 --- a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts +++ b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts @@ -1,18 +1,24 @@ -import { BaseDocumentTransformer, Document } from "@langchain/core/documents" -import { Embeddings } from "@langchain/core/embeddings" +import { BaseDocumentTransformer, Document } from "@langchain/core/documents"; +import { Embeddings } from "@langchain/core/embeddings"; import { cosineSimilarity } from "@langchain/core/utils/math"; type Sentence = { - sentence: string - combined_sentence_embedding?: number[] - combined_sentence?: string - distance_to_next?: number -} - -type SentenceWithCombinedSentence = Sentence & { combined_sentence: string } -type SentenceWithEmbedding = Sentence & { combined_sentence_embedding: number[]; distance_to_next: number } - -function combineSentences(sentences: Sentence[], sentencesToCombine: number = 1) { + sentence: string; + combined_sentence_embedding?: number[]; + combined_sentence?: string; + distance_to_next?: number; +}; + +type SentenceWithCombinedSentence = Sentence & { 
combined_sentence: string }; +type SentenceWithEmbedding = Sentence & { + combined_sentence_embedding: number[]; + distance_to_next: number; +}; + +function combineSentences( + sentences: Sentence[], + sentencesToCombine: number = 1 +) { /** * Combine sentences based on buffer size. * @@ -22,33 +28,35 @@ function combineSentences(sentences: Sentence[], sentencesToCombine: number = 1) */ for (let i = 0; i < sentences.length; i++) { - let combinedSentence = "" + let combinedSentence = ""; // Add sentences before the current one for (let j = i - sentencesToCombine; j < i; j++) { if (j >= 0) { - combinedSentence += sentences[j].sentence + " " + combinedSentence += sentences[j].sentence + " "; } } // Add the current sentence - combinedSentence += sentences[i].sentence + combinedSentence += sentences[i].sentence; // Add sentences after the current one for (let j = i + 1; j < i + 1 + sentencesToCombine; j++) { if (j < sentences.length) { - combinedSentence += " " + sentences[j].sentence + combinedSentence += " " + sentences[j].sentence; } } // Store the combined sentence in the current sentence object - sentences[i].combined_sentence = combinedSentence + sentences[i].combined_sentence = combinedSentence; } - return sentences as SentenceWithCombinedSentence[] + return sentences as SentenceWithCombinedSentence[]; } -function calculateCosineDistances(sentences: SentenceWithEmbedding[]): [number[], SentenceWithEmbedding[]] { +function calculateCosineDistances( + sentences: SentenceWithEmbedding[] +): [number[], SentenceWithEmbedding[]] { /** * Calculate cosine distances between sentences. * @@ -56,26 +64,26 @@ function calculateCosineDistances(sentences: SentenceWithEmbedding[]): [number[] * @returns Tuple of distances and sentences. 
*/ - const distances: number[] = [] + const distances: number[] = []; for (let i = 0; i < sentences.length - 1; i++) { - const embeddingCurrent = sentences[i].combined_sentence_embedding - const embeddingNext = sentences[i + 1].combined_sentence_embedding + const embeddingCurrent = sentences[i].combined_sentence_embedding; + const embeddingNext = sentences[i + 1].combined_sentence_embedding; // Calculate cosine similarity - const similarity = cosineSimilarity(embeddingCurrent, embeddingNext) + const similarity = cosineSimilarity(embeddingCurrent, embeddingNext); // Convert to cosine distance - const distance = 1 - similarity + const distance = 1 - similarity; // Append cosine distance to the list - distances.push(distance) + distances.push(distance); // Store distance in the dictionary - sentences[i].distance_to_next = distance + sentences[i].distance_to_next = distance; } - return [distances, sentences] + return [distances, sentences]; } enum BreakpointThresholdType { @@ -90,161 +98,209 @@ const BREAKPOINT_DEFAULTS: Record = { standard_deviation: 3, interquartile: 1.5, gradient: 95, -} +}; interface SemanticChunkerOptions { - sentencesToCombine?: number - sentenceSplitRegex?: RegExp - addStartIndex?: boolean - breakpointThresholdType?: BreakpointThresholdType - breakpointThresholdAmount?: number - numberOfChunks?: number - minChunkSize?: number + sentencesToCombine?: number; + sentenceSplitRegex?: RegExp; + addStartIndex?: boolean; + breakpointThresholdType?: BreakpointThresholdType; + breakpointThresholdAmount?: number; + numberOfChunks?: number; + minChunkSize?: number; } export class SemanticChunker extends BaseDocumentTransformer { - private sentencesToCombine: number = 1 - private sentenceSplitRegex: RegExp = new RegExp(`(?<=[.?!])\\s+`) - private addStartIndex: boolean = false - private breakpointThresholdType: BreakpointThresholdType = BreakpointThresholdType.PERCENTILE - private breakpointThresholdAmount: number = 
BREAKPOINT_DEFAULTS[this.breakpointThresholdType] - private numberOfChunks?: number - private minChunkSize?: number + private sentencesToCombine: number = 1; + private sentenceSplitRegex: RegExp = new RegExp(`(?<=[.?!])\\s+`); + private addStartIndex: boolean = false; + private breakpointThresholdType: BreakpointThresholdType = + BreakpointThresholdType.PERCENTILE; + private breakpointThresholdAmount: number = + BREAKPOINT_DEFAULTS[this.breakpointThresholdType]; + private numberOfChunks?: number; + private minChunkSize?: number; constructor( private embeddings: Embeddings, options?: SemanticChunkerOptions ) { - super() + super(); if (options) { - if (options.sentencesToCombine !== undefined) this.sentencesToCombine = options.sentencesToCombine - if (options.sentenceSplitRegex !== undefined) this.sentenceSplitRegex = new RegExp(options.sentenceSplitRegex) - if (options.addStartIndex !== undefined) this.addStartIndex = options.addStartIndex + if (options.sentencesToCombine !== undefined) + this.sentencesToCombine = options.sentencesToCombine; + if (options.sentenceSplitRegex !== undefined) + this.sentenceSplitRegex = new RegExp(options.sentenceSplitRegex); + if (options.addStartIndex !== undefined) + this.addStartIndex = options.addStartIndex; if (options.breakpointThresholdType !== undefined) { - this.breakpointThresholdType = options.breakpointThresholdType - this.breakpointThresholdAmount = BREAKPOINT_DEFAULTS[options.breakpointThresholdType] + this.breakpointThresholdType = options.breakpointThresholdType; + this.breakpointThresholdAmount = + BREAKPOINT_DEFAULTS[options.breakpointThresholdType]; } - if (options.breakpointThresholdAmount !== undefined) this.breakpointThresholdAmount = options.breakpointThresholdAmount - if (options.numberOfChunks !== undefined) this.numberOfChunks = options.numberOfChunks - if (options.minChunkSize !== undefined) this.minChunkSize = options.minChunkSize + if (options.breakpointThresholdAmount !== undefined) + 
this.breakpointThresholdAmount = options.breakpointThresholdAmount; + if (options.numberOfChunks !== undefined) + this.numberOfChunks = options.numberOfChunks; + if (options.minChunkSize !== undefined) + this.minChunkSize = options.minChunkSize; } } // Utility functions private mean(arr: number[]): number { - return arr.reduce((a, b) => a + b, 0) / arr.length + return arr.reduce((a, b) => a + b, 0) / arr.length; } private std(arr: number[]): number { - const mean = this.mean(arr) - return Math.sqrt(arr.map((x) => (x - mean) ** 2).reduce((a, b) => a + b, 0) / arr.length) + const mean = this.mean(arr); + return Math.sqrt( + arr.map((x) => (x - mean) ** 2).reduce((a, b) => a + b, 0) / arr.length + ); } private percentile(arr: number[], p: number): number { - const sorted = [...arr].sort((a, b) => a - b) - const index = Math.ceil((p / 100) * sorted.length) - 1 - return sorted[Math.max(index, 0)] + const sorted = [...arr].sort((a, b) => a - b); + const index = Math.ceil((p / 100) * sorted.length) - 1; + return sorted[Math.max(index, 0)]; } private percentileValues(arr: number[], percentiles: number[]): number[] { - return percentiles.map((p) => this.percentile(arr, p)) + return percentiles.map((p) => this.percentile(arr, p)); } private gradient(arr: number[]): number[] { - return arr.map((_, i, a) => (i === 0 ? a[i + 1] - a[i] : a[i] - a[i - 1])).slice(1) + return arr + .map((_, i, a) => (i === 0 ? 
a[i + 1] - a[i] : a[i] - a[i - 1])) + .slice(1); } - private _calculateBreakpointThreshold(distances: number[]): [number, number[]] { + private _calculateBreakpointThreshold( + distances: number[] + ): [number, number[]] { switch (this.breakpointThresholdType) { case BreakpointThresholdType.PERCENTILE: - return [this.percentile(distances, this.breakpointThresholdAmount), distances] + return [ + this.percentile(distances, this.breakpointThresholdAmount), + distances, + ]; case BreakpointThresholdType.STANDARD_DEVIATION: - return [this.mean(distances) + this.breakpointThresholdAmount * this.std(distances), distances] + return [ + this.mean(distances) + + this.breakpointThresholdAmount * this.std(distances), + distances, + ]; case BreakpointThresholdType.INTERQUARTILE: - const [q1, q3] = this.percentileValues(distances, [25, 75]) - const iqr = q3 - q1 - return [this.mean(distances) + this.breakpointThresholdAmount * iqr, distances] + const [q1, q3] = this.percentileValues(distances, [25, 75]); + const iqr = q3 - q1; + return [ + this.mean(distances) + this.breakpointThresholdAmount * iqr, + distances, + ]; case BreakpointThresholdType.GRADIENT: - const distanceGradient = this.gradient(distances) - return [this.percentile(distanceGradient, this.breakpointThresholdAmount), distanceGradient] + const distanceGradient = this.gradient(distances); + return [ + this.percentile(distanceGradient, this.breakpointThresholdAmount), + distanceGradient, + ]; default: - throw new Error(`Unexpected breakpointThresholdType: ${this.breakpointThresholdType}`) + throw new Error( + `Unexpected breakpointThresholdType: ${this.breakpointThresholdType}` + ); } } private _thresholdFromClusters(distances: number[]): number { if (this.numberOfChunks === undefined) { - throw new Error("This should never be called if `numberOfChunks` is undefined.") + throw new Error( + "This should never be called if `numberOfChunks` is undefined." 
+ ); } - const x1 = distances.length - const y1 = 0.0 - const x2 = 1.0 - const y2 = 100.0 + const x1 = distances.length; + const y1 = 0.0; + const x2 = 1.0; + const y2 = 100.0; - const x = Math.max(Math.min(this.numberOfChunks, x1), x2) - const y = x2 === x1 ? y2 : y1 + ((y2 - y1) / (x2 - x1)) * (x - x1) + const x = Math.max(Math.min(this.numberOfChunks, x1), x2); + const y = x2 === x1 ? y2 : y1 + ((y2 - y1) / (x2 - x1)) * (x - x1); - return this.percentile(distances, Math.min(Math.max(y, 0), 100)) + return this.percentile(distances, Math.min(Math.max(y, 0), 100)); } - private async _calculateSentenceDistances(singleSentencesList: string[]): Promise<[number[], any[]]> { + private async _calculateSentenceDistances( + singleSentencesList: string[] + ): Promise<[number[], any[]]> { const sentences = combineSentences( singleSentencesList.map((sentence) => ({ sentence })), this.sentencesToCombine - ) + ); - const embeddings = await this.embeddings.embedDocuments(sentences.map((x) => x.combined_sentence)) + const embeddings = await this.embeddings.embedDocuments( + sentences.map((x) => x.combined_sentence) + ); sentences.forEach((sentence, i) => { - sentence.combined_sentence_embedding = embeddings[i] - }) + sentence.combined_sentence_embedding = embeddings[i]; + }); - return calculateCosineDistances(sentences as SentenceWithEmbedding[]) + return calculateCosineDistances(sentences as SentenceWithEmbedding[]); } private _getSingleSentencesList(text: string): string[] { - return text.split(this.sentenceSplitRegex) + return text.split(this.sentenceSplitRegex); } async splitText(text: string): Promise { - const singleSentencesList = this._getSingleSentencesList(text) + const singleSentencesList = this._getSingleSentencesList(text); - if (singleSentencesList.length <= 1) return singleSentencesList + if (singleSentencesList.length <= 1) return singleSentencesList; - if (this.breakpointThresholdType === BreakpointThresholdType.GRADIENT && singleSentencesList.length === 2) { - 
return singleSentencesList + if ( + this.breakpointThresholdType === BreakpointThresholdType.GRADIENT && + singleSentencesList.length === 2 + ) { + return singleSentencesList; } - const [distances, sentences] = await this._calculateSentenceDistances(singleSentencesList) - let breakpointDistanceThreshold: number - let breakpointArray: number[] + const [distances, sentences] = await this._calculateSentenceDistances( + singleSentencesList + ); + let breakpointDistanceThreshold: number; + let breakpointArray: number[]; if (this.numberOfChunks !== undefined) { - breakpointDistanceThreshold = this._thresholdFromClusters(distances) - breakpointArray = distances + breakpointDistanceThreshold = this._thresholdFromClusters(distances); + breakpointArray = distances; } else { - ;[breakpointDistanceThreshold, breakpointArray] = this._calculateBreakpointThreshold(distances) + [breakpointDistanceThreshold, breakpointArray] = + this._calculateBreakpointThreshold(distances); } - const indicesAboveThresh = breakpointArray.map((x, i) => (x > breakpointDistanceThreshold ? i : -1)).filter((i) => i !== -1) + const indicesAboveThresh = breakpointArray + .map((x, i) => (x > breakpointDistanceThreshold ? 
i : -1)) + .filter((i) => i !== -1); - const chunks: string[] = [] - let startIndex = 0 + const chunks: string[] = []; + let startIndex = 0; for (const index of indicesAboveThresh) { - const endIndex = index - const group = sentences.slice(startIndex, endIndex + 1) - const combinedText = group.map((d) => d.sentence).join(" ") + const endIndex = index; + const group = sentences.slice(startIndex, endIndex + 1); + const combinedText = group.map((d) => d.sentence).join(" "); - if (this.minChunkSize !== undefined && combinedText.length < this.minChunkSize) continue - chunks.push(combinedText) - startIndex = index + 1 + if ( + this.minChunkSize !== undefined && + combinedText.length < this.minChunkSize + ) + continue; + chunks.push(combinedText); + startIndex = index + 1; } if (startIndex < sentences.length) { @@ -253,46 +309,49 @@ export class SemanticChunker extends BaseDocumentTransformer { .slice(startIndex) .map((d) => d.sentence) .join(" ") - ) + ); } - return chunks + return chunks; } - async createDocuments(texts: string[], metadatas?: Record[]): Promise { - const _metadatas = metadatas || Array(texts.length).fill({}) - const documents: Document[] = [] + async createDocuments( + texts: string[], + metadatas?: Record[] + ): Promise { + const _metadatas = metadatas || Array(texts.length).fill({}); + const documents: Document[] = []; for (const [i, text] of texts.entries()) { - let startIndex = 0 - const chunks = await this.splitText(text) + let startIndex = 0; + const chunks = await this.splitText(text); chunks.forEach((chunk) => { - const metadata = { ..._metadatas[i] } + const metadata = { ..._metadatas[i] }; - if (this.addStartIndex) metadata.start_index = startIndex + if (this.addStartIndex) metadata.start_index = startIndex; - documents.push(new Document({ pageContent: chunk, metadata })) - startIndex += chunk.length - }) + documents.push(new Document({ pageContent: chunk, metadata })); + startIndex += chunk.length; + }); } - return documents + return 
documents;
   }
 
   async splitDocuments(documents: Document[]): Promise<Document[]> {
-    const texts: string[] = []
-    const metadatas: Record<string, any>[] = []
+    const texts: string[] = [];
+    const metadatas: Record<string, any>[] = [];
 
     for (const doc of documents) {
-      texts.push(doc.pageContent)
-      metadatas.push(doc.metadata)
+      texts.push(doc.pageContent);
+      metadatas.push(doc.metadata);
     }
 
-    return this.createDocuments(texts, metadatas)
+    return this.createDocuments(texts, metadatas);
   }
 
   async transformDocuments(documents: Document[]): Promise<Document[]> {
-    return this.splitDocuments(documents)
+    return this.splitDocuments(documents);
   }
 }

From 36cb00f896dbd284f527d9fb52e1ad92a305f8a1 Mon Sep 17 00:00:00 2001
From: Daniel Arroyo
Date: Tue, 22 Apr 2025 20:51:02 +0200
Subject: [PATCH 6/6] test: add unit tests for semantic chunker

---
 .../text_splitters/semantic_chunker.ts        |  14 +-
 .../tests/semantic_chunker.test.ts            | 126 ++++++++++++++++++
 2 files changed, 137 insertions(+), 3 deletions(-)
 create mode 100644 libs/langchain-community/src/experimental/text_splitters/tests/semantic_chunker.test.ts

diff --git a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts
index 46322eb5d5a9..7271faaeaabf 100644
--- a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts
+++ b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts
@@ -1,6 +1,5 @@
 import { BaseDocumentTransformer, Document } from "@langchain/core/documents";
 import { Embeddings } from "@langchain/core/embeddings";
-import { cosineSimilarity } from "@langchain/core/utils/math";
 
 type Sentence = {
   sentence: string;
@@ -15,7 +14,16 @@ type SentenceWithEmbedding = Sentence & {
   distance_to_next: number;
 };
 
-function combineSentences(
+// Utility function for cosine similarity
+function cosineSimilarity(vecA: number[], vecB: number[]) {
+  const dotProduct = vecA.reduce((sum, a, i) => sum + a * vecB[i], 0)
+  const magnitudeA = 
Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0)) + const magnitudeB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0)) + + return magnitudeA && magnitudeB ? dotProduct / (magnitudeA * magnitudeB) : 0 +} + +export function combineSentences( sentences: Sentence[], sentencesToCombine: number = 1 ) { @@ -54,7 +62,7 @@ function combineSentences( return sentences as SentenceWithCombinedSentence[]; } -function calculateCosineDistances( +export function calculateCosineDistances( sentences: SentenceWithEmbedding[] ): [number[], SentenceWithEmbedding[]] { /** diff --git a/libs/langchain-community/src/experimental/text_splitters/tests/semantic_chunker.test.ts b/libs/langchain-community/src/experimental/text_splitters/tests/semantic_chunker.test.ts new file mode 100644 index 000000000000..0d6c0100d839 --- /dev/null +++ b/libs/langchain-community/src/experimental/text_splitters/tests/semantic_chunker.test.ts @@ -0,0 +1,126 @@ +import { + combineSentences, + calculateCosineDistances, +} from "../semantic_chunker.js"; + +describe("combineSentences", () => { + test("combines sentences with default sentencesToCombine (1)", () => { + const sentences = [ + { sentence: "This is sentence one." }, + { sentence: "This is sentence two." }, + { sentence: "This is sentence three." }, + ]; + + const result = combineSentences(sentences); + + expect(result).toEqual([ + { + sentence: "This is sentence one.", + combined_sentence: "This is sentence one. This is sentence two.", + }, + { + sentence: "This is sentence two.", + combined_sentence: + "This is sentence one. This is sentence two. This is sentence three.", + }, + { + sentence: "This is sentence three.", + combined_sentence: "This is sentence two. 
This is sentence three.", + }, + ]); + }); + + test("handles an empty array of sentences", () => { + const sentences: any[] = []; + + const result = combineSentences(sentences); + + expect(result).toEqual([]); + }); + + test("handles a single sentence", () => { + const sentences = [{ sentence: "Only one sentence here." }]; + + const result = combineSentences(sentences); + + expect(result).toEqual([ + { + sentence: "Only one sentence here.", + combined_sentence: "Only one sentence here.", + }, + ]); + }); +}); + +describe("calculateCosineDistances", () => { + test("calculates cosine distances between sentence embeddings", () => { + const sentences = [ + { + sentence: "Sentence one.", + combined_sentence_embedding: [1, 0, 0], + distance_to_next: 0, + }, + { + sentence: "Sentence two.", + combined_sentence_embedding: [0, 1, 0], + distance_to_next: 0, + }, + { + sentence: "Sentence three.", + combined_sentence_embedding: [0, 0, 1], + distance_to_next: 0, + }, + ]; + + const [distances, updatedSentences] = calculateCosineDistances(sentences); + + expect(distances).toEqual([1, 1]); + expect(updatedSentences).toEqual([ + { + sentence: "Sentence one.", + combined_sentence_embedding: [1, 0, 0], + distance_to_next: 1, + }, + { + sentence: "Sentence two.", + combined_sentence_embedding: [0, 1, 0], + distance_to_next: 1, + }, + { + sentence: "Sentence three.", + combined_sentence_embedding: [0, 0, 1], + distance_to_next: 0, + }, + ]); + }); + + test("handles a single sentence with no distances to calculate", () => { + const sentences = [ + { + sentence: "Only one sentence.", + combined_sentence_embedding: [1, 0, 0], + distance_to_next: 0, + }, + ]; + + const [distances, updatedSentences] = calculateCosineDistances(sentences); + + expect(distances).toEqual([]); + expect(updatedSentences).toEqual([ + { + sentence: "Only one sentence.", + combined_sentence_embedding: [1, 0, 0], + distance_to_next: 0, + }, + ]); + }); + + test("handles an empty array of sentences", () => { + const 
sentences: any[] = []; + + const [distances, updatedSentences] = calculateCosineDistances(sentences); + + expect(distances).toEqual([]); + expect(updatedSentences).toEqual([]); + }); +});