From deb9a494a0a688a64f08f42e87e12ea2ca14ffa7 Mon Sep 17 00:00:00 2001 From: Daniel Arroyo Date: Mon, 17 Mar 2025 15:49:07 +0100 Subject: [PATCH 1/6] feat(text_splitters): Add SemanticChunker splitter --- .../text_splitters/sematic_chunker.ts | 306 ++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100644 libs/langchain-community/src/experimental/text_splitters/sematic_chunker.ts diff --git a/libs/langchain-community/src/experimental/text_splitters/sematic_chunker.ts b/libs/langchain-community/src/experimental/text_splitters/sematic_chunker.ts new file mode 100644 index 000000000000..d585a69bc372 --- /dev/null +++ b/libs/langchain-community/src/experimental/text_splitters/sematic_chunker.ts @@ -0,0 +1,306 @@ +import { BaseDocumentTransformer, Document } from "@langchain/core/documents" +import { Embeddings } from "@langchain/core/embeddings" + +type Sentence = { + sentence: string + combined_sentence_embedding?: number[] + combined_sentence?: string + distance_to_next?: number +} + +type SentenceWithCombinedSentence = Sentence & { combined_sentence: string } +type SentenceWithEmbedding = Sentence & { combined_sentence_embedding: number[]; distance_to_next: number } + +function combineSentences(sentences: Sentence[], sentecesToCombine: number = 1) { + /** + * Combine sentences based on buffer size. + * + * @param sentences - List of sentences to combine. + * @param sentecesToCombine - Number of sentences to combine. Defaults to 1. + * @returns List of sentences with combined sentences. 
+ */ + + for (let i = 0; i < sentences.length; i++) { + let combinedSentence = "" + + // Add sentences before the current one + for (let j = i - sentecesToCombine; j < i; j++) { + if (j >= 0) { + combinedSentence += sentences[j].sentence + " " + } + } + + // Add the current sentence + combinedSentence += sentences[i].sentence + + // Add sentences after the current one + for (let j = i + 1; j < i + 1 + sentecesToCombine; j++) { + if (j < sentences.length) { + combinedSentence += " " + sentences[j].sentence + } + } + + // Store the combined sentence in the current sentence object + sentences[i].combined_sentence = combinedSentence + } + + return sentences as SentenceWithCombinedSentence[] +} + +function calculateCosineDistances(sentences: SentenceWithEmbedding[]): [number[], SentenceWithEmbedding[]] { + /** + * Calculate cosine distances between sentences. + * + * @param sentences - List of sentences to calculate distances for. + * @returns Tuple of distances and sentences. + */ + + const distances: number[] = [] + + for (let i = 0; i < sentences.length - 1; i++) { + const embeddingCurrent = sentences[i].combined_sentence_embedding + const embeddingNext = sentences[i + 1].combined_sentence_embedding + + // Calculate cosine similarity + const similarity = cosineSimilarity(embeddingCurrent, embeddingNext) + + // Convert to cosine distance + const distance = 1 - similarity + + // Append cosine distance to the list + distances.push(distance) + + // Store distance in the dictionary + sentences[i].distance_to_next = distance + } + + return [distances, sentences] +} + +// Utility function for cosine similarity +function cosineSimilarity(vecA: number[], vecB: number[]) { + const dotProduct = vecA.reduce((sum, a, i) => sum + a * vecB[i], 0) + const magnitudeA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0)) + const magnitudeB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0)) + + return magnitudeA && magnitudeB ? 
dotProduct / (magnitudeA * magnitudeB) : 0 +} + +enum BreakpointThresholdType { + PERCENTILE = "percentile", + STANDARD_DEVIATION = "standard_deviation", + INTERQUARTILE = "interquartile", + GRADIENT = "gradient", +} + +const BREAKPOINT_DEFAULTS: Record = { + percentile: 95, + standard_deviation: 3, + interquartile: 1.5, + gradient: 95, +} + +interface SemanticChunkerOptions { + sentecesToCombine?: number + sentenceSplitRegex?: RegExp + addStartIndex?: boolean + breakpointThresholdType?: BreakpointThresholdType + breakpointThresholdAmount?: number + numberOfChunks?: number + minChunkSize?: number +} + +export class SemanticChunker extends BaseDocumentTransformer { + private sentecesToCombine: number = 1 + private sentenceSplitRegex: RegExp = new RegExp(`(?<=[.?!])\\s+`) + private addStartIndex: boolean = false + private breakpointThresholdType: BreakpointThresholdType = BreakpointThresholdType.PERCENTILE + private breakpointThresholdAmount: number = BREAKPOINT_DEFAULTS[this.breakpointThresholdType] + private numberOfChunks?: number + private minChunkSize?: number + + constructor( + private embeddings: Embeddings, + options?: SemanticChunkerOptions + ) { + super() + + if (options) { + if (options.sentecesToCombine !== undefined) this.sentecesToCombine = options.sentecesToCombine + if (options.sentenceSplitRegex !== undefined) this.sentenceSplitRegex = new RegExp(options.sentenceSplitRegex) + if (options.addStartIndex !== undefined) this.addStartIndex = options.addStartIndex + if (options.breakpointThresholdType !== undefined) { + this.breakpointThresholdType = options.breakpointThresholdType + this.breakpointThresholdAmount = BREAKPOINT_DEFAULTS[options.breakpointThresholdType] + } + if (options.breakpointThresholdAmount !== undefined) this.breakpointThresholdAmount = options.breakpointThresholdAmount + if (options.numberOfChunks !== undefined) this.numberOfChunks = options.numberOfChunks + if (options.minChunkSize !== undefined) this.minChunkSize = 
options.minChunkSize + } + } + + // Utility functions + private mean(arr: number[]): number { + return arr.reduce((a, b) => a + b, 0) / arr.length + } + + private std(arr: number[]): number { + const mean = this.mean(arr) + return Math.sqrt(arr.map((x) => (x - mean) ** 2).reduce((a, b) => a + b, 0) / arr.length) + } + + private percentile(arr: number[], p: number): number { + const sorted = [...arr].sort((a, b) => a - b) + const index = Math.ceil((p / 100) * sorted.length) - 1 + return sorted[Math.max(index, 0)] + } + + private percentileValues(arr: number[], percentiles: number[]): number[] { + return percentiles.map((p) => this.percentile(arr, p)) + } + + private gradient(arr: number[]): number[] { + return arr.map((_, i, a) => (i === 0 ? a[i + 1] - a[i] : a[i] - a[i - 1])).slice(1) + } + + private _calculateBreakpointThreshold(distances: number[]): [number, number[]] { + switch (this.breakpointThresholdType) { + case BreakpointThresholdType.PERCENTILE: + return [this.percentile(distances, this.breakpointThresholdAmount), distances] + + case BreakpointThresholdType.STANDARD_DEVIATION: + return [this.mean(distances) + this.breakpointThresholdAmount * this.std(distances), distances] + + case BreakpointThresholdType.INTERQUARTILE: + const [q1, q3] = this.percentileValues(distances, [25, 75]) + const iqr = q3 - q1 + return [this.mean(distances) + this.breakpointThresholdAmount * iqr, distances] + + case BreakpointThresholdType.GRADIENT: + const distanceGradient = this.gradient(distances) + return [this.percentile(distanceGradient, this.breakpointThresholdAmount), distanceGradient] + + default: + throw new Error(`Unexpected breakpointThresholdType: ${this.breakpointThresholdType}`) + } + } + + private _thresholdFromClusters(distances: number[]): number { + if (this.numberOfChunks === undefined) { + throw new Error("This should never be called if `numberOfChunks` is undefined.") + } + + const x1 = distances.length + const y1 = 0.0 + const x2 = 1.0 + const y2 = 100.0 + 
+ const x = Math.max(Math.min(this.numberOfChunks, x1), x2) + const y = x2 === x1 ? y2 : y1 + ((y2 - y1) / (x2 - x1)) * (x - x1) + + return this.percentile(distances, Math.min(Math.max(y, 0), 100)) + } + + private async _calculateSentenceDistances(singleSentencesList: string[]): Promise<[number[], any[]]> { + const sentences = combineSentences( + singleSentencesList.map((sentence) => ({ sentence })), + this.sentecesToCombine + ) + + const embeddings = await this.embeddings.embedDocuments(sentences.map((x) => x.combined_sentence)) + + sentences.forEach((sentence, i) => { + sentence.combined_sentence_embedding = embeddings[i] + }) + + return calculateCosineDistances(sentences as SentenceWithEmbedding[]) + } + + private _getSingleSentencesList(text: string): string[] { + return text.split(this.sentenceSplitRegex) + } + + async splitText(text: string): Promise { + const singleSentencesList = this._getSingleSentencesList(text) + + if (singleSentencesList.length <= 1) return singleSentencesList + + if (this.breakpointThresholdType === BreakpointThresholdType.GRADIENT && singleSentencesList.length === 2) { + return singleSentencesList + } + + const [distances, sentences] = await this._calculateSentenceDistances(singleSentencesList) + let breakpointDistanceThreshold: number + let breakpointArray: number[] + + if (this.numberOfChunks !== undefined) { + breakpointDistanceThreshold = this._thresholdFromClusters(distances) + breakpointArray = distances + } else { + ;[breakpointDistanceThreshold, breakpointArray] = this._calculateBreakpointThreshold(distances) + } + + const indicesAboveThresh = breakpointArray.map((x, i) => (x > breakpointDistanceThreshold ? 
i : -1)).filter((i) => i !== -1) + + const chunks: string[] = [] + let startIndex = 0 + + for (const index of indicesAboveThresh) { + const endIndex = index + const group = sentences.slice(startIndex, endIndex + 1) + const combinedText = group.map((d) => d.sentence).join(" ") + + if (this.minChunkSize !== undefined && combinedText.length < this.minChunkSize) continue + chunks.push(combinedText) + startIndex = index + 1 + } + + if (startIndex < sentences.length) { + chunks.push( + sentences + .slice(startIndex) + .map((d) => d.sentence) + .join(" ") + ) + } + + return chunks + } + + async createDocuments(texts: string[], metadatas?: Record[]): Promise { + const _metadatas = metadatas || Array(texts.length).fill({}) + const documents: Document[] = [] + + for (const [i, text] of texts.entries()) { + let startIndex = 0 + const chunks = await this.splitText(text) + + chunks.forEach((chunk) => { + const metadata = { ..._metadatas[i] } + + if (this.addStartIndex) metadata.start_index = startIndex + + documents.push(new Document({ pageContent: chunk, metadata })) + startIndex += chunk.length + }) + } + + return documents + } + + async splitDocuments(documents: Document[]): Promise { + const texts: string[] = [] + const metadatas: Record[] = [] + + for (const doc of documents) { + texts.push(doc.pageContent) + metadatas.push(doc.metadata) + } + + return this.createDocuments(texts, metadatas) + } + + async transformDocuments(documents: Document[]): Promise { + return this.splitDocuments(documents) + } +} From 6550f668cbdf2785ad9f0b05d4957fe22c35eba5 Mon Sep 17 00:00:00 2001 From: Daniel Arroyo Date: Mon, 17 Mar 2025 15:52:42 +0100 Subject: [PATCH 2/6] fix(experimental): type in file name --- .../text_splitters/{sematic_chunker.ts => semantic_chunker.ts} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename libs/langchain-community/src/experimental/text_splitters/{sematic_chunker.ts => semantic_chunker.ts} (100%) diff --git 
a/libs/langchain-community/src/experimental/text_splitters/sematic_chunker.ts b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts similarity index 100% rename from libs/langchain-community/src/experimental/text_splitters/sematic_chunker.ts rename to libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts From 0e941769550a2e28aa27584c44d955843896ffa2 Mon Sep 17 00:00:00 2001 From: Daniel Arroyo Date: Mon, 31 Mar 2025 19:37:28 +0200 Subject: [PATCH 3/6] fix: Use existing cosineSimilarity function --- .../experimental/text_splitters/semantic_chunker.ts | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts index d585a69bc372..8df7541197c8 100644 --- a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts +++ b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts @@ -1,5 +1,6 @@ import { BaseDocumentTransformer, Document } from "@langchain/core/documents" import { Embeddings } from "@langchain/core/embeddings" +import { cosineSimilarity } from "@langchain/core/utils/math"; type Sentence = { sentence: string @@ -77,15 +78,6 @@ function calculateCosineDistances(sentences: SentenceWithEmbedding[]): [number[] return [distances, sentences] } -// Utility function for cosine similarity -function cosineSimilarity(vecA: number[], vecB: number[]) { - const dotProduct = vecA.reduce((sum, a, i) => sum + a * vecB[i], 0) - const magnitudeA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0)) - const magnitudeB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0)) - - return magnitudeA && magnitudeB ? 
dotProduct / (magnitudeA * magnitudeB) : 0 -} - enum BreakpointThresholdType { PERCENTILE = "percentile", STANDARD_DEVIATION = "standard_deviation", From a69b8fcd7e67500075a5587911f41484741f2e1c Mon Sep 17 00:00:00 2001 From: Daniel Arroyo Date: Mon, 31 Mar 2025 19:42:00 +0200 Subject: [PATCH 4/6] fix: nit nit: senteces -> sentences --- .../text_splitters/semantic_chunker.ts | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts index 8df7541197c8..2acbe7342d9d 100644 --- a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts +++ b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts @@ -12,12 +12,12 @@ type Sentence = { type SentenceWithCombinedSentence = Sentence & { combined_sentence: string } type SentenceWithEmbedding = Sentence & { combined_sentence_embedding: number[]; distance_to_next: number } -function combineSentences(sentences: Sentence[], sentecesToCombine: number = 1) { +function combineSentences(sentences: Sentence[], sentencesToCombine: number = 1) { /** * Combine sentences based on buffer size. * * @param sentences - List of sentences to combine. - * @param sentecesToCombine - Number of sentences to combine. Defaults to 1. + * @param sentencesToCombine - Number of sentences to combine. Defaults to 1. * @returns List of sentences with combined sentences. 
*/ @@ -25,7 +25,7 @@ function combineSentences(sentences: Sentence[], sentecesToCombine: number = 1) let combinedSentence = "" // Add sentences before the current one - for (let j = i - sentecesToCombine; j < i; j++) { + for (let j = i - sentencesToCombine; j < i; j++) { if (j >= 0) { combinedSentence += sentences[j].sentence + " " } @@ -35,7 +35,7 @@ function combineSentences(sentences: Sentence[], sentecesToCombine: number = 1) combinedSentence += sentences[i].sentence // Add sentences after the current one - for (let j = i + 1; j < i + 1 + sentecesToCombine; j++) { + for (let j = i + 1; j < i + 1 + sentencesToCombine; j++) { if (j < sentences.length) { combinedSentence += " " + sentences[j].sentence } @@ -93,7 +93,7 @@ const BREAKPOINT_DEFAULTS: Record = { } interface SemanticChunkerOptions { - sentecesToCombine?: number + sentencesToCombine?: number sentenceSplitRegex?: RegExp addStartIndex?: boolean breakpointThresholdType?: BreakpointThresholdType @@ -103,7 +103,7 @@ interface SemanticChunkerOptions { } export class SemanticChunker extends BaseDocumentTransformer { - private sentecesToCombine: number = 1 + private sentencesToCombine: number = 1 private sentenceSplitRegex: RegExp = new RegExp(`(?<=[.?!])\\s+`) private addStartIndex: boolean = false private breakpointThresholdType: BreakpointThresholdType = BreakpointThresholdType.PERCENTILE @@ -118,7 +118,7 @@ export class SemanticChunker extends BaseDocumentTransformer { super() if (options) { - if (options.sentecesToCombine !== undefined) this.sentecesToCombine = options.sentecesToCombine + if (options.sentencesToCombine !== undefined) this.sentencesToCombine = options.sentencesToCombine if (options.sentenceSplitRegex !== undefined) this.sentenceSplitRegex = new RegExp(options.sentenceSplitRegex) if (options.addStartIndex !== undefined) this.addStartIndex = options.addStartIndex if (options.breakpointThresholdType !== undefined) { @@ -196,7 +196,7 @@ export class SemanticChunker extends 
BaseDocumentTransformer { private async _calculateSentenceDistances(singleSentencesList: string[]): Promise<[number[], any[]]> { const sentences = combineSentences( singleSentencesList.map((sentence) => ({ sentence })), - this.sentecesToCombine + this.sentencesToCombine ) const embeddings = await this.embeddings.embedDocuments(sentences.map((x) => x.combined_sentence)) From 1e369a3492be66a40f9a64658d040d730729ddf5 Mon Sep 17 00:00:00 2001 From: Daniel Arroyo Date: Mon, 31 Mar 2025 19:49:14 +0200 Subject: [PATCH 5/6] fix: execute yarn format --- .../text_splitters/semantic_chunker.ts | 301 +++++++++++------- 1 file changed, 180 insertions(+), 121 deletions(-) diff --git a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts index 2acbe7342d9d..46322eb5d5a9 100644 --- a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts +++ b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts @@ -1,18 +1,24 @@ -import { BaseDocumentTransformer, Document } from "@langchain/core/documents" -import { Embeddings } from "@langchain/core/embeddings" +import { BaseDocumentTransformer, Document } from "@langchain/core/documents"; +import { Embeddings } from "@langchain/core/embeddings"; import { cosineSimilarity } from "@langchain/core/utils/math"; type Sentence = { - sentence: string - combined_sentence_embedding?: number[] - combined_sentence?: string - distance_to_next?: number -} - -type SentenceWithCombinedSentence = Sentence & { combined_sentence: string } -type SentenceWithEmbedding = Sentence & { combined_sentence_embedding: number[]; distance_to_next: number } - -function combineSentences(sentences: Sentence[], sentencesToCombine: number = 1) { + sentence: string; + combined_sentence_embedding?: number[]; + combined_sentence?: string; + distance_to_next?: number; +}; + +type SentenceWithCombinedSentence = Sentence & { 
combined_sentence: string }; +type SentenceWithEmbedding = Sentence & { + combined_sentence_embedding: number[]; + distance_to_next: number; +}; + +function combineSentences( + sentences: Sentence[], + sentencesToCombine: number = 1 +) { /** * Combine sentences based on buffer size. * @@ -22,33 +28,35 @@ function combineSentences(sentences: Sentence[], sentencesToCombine: number = 1) */ for (let i = 0; i < sentences.length; i++) { - let combinedSentence = "" + let combinedSentence = ""; // Add sentences before the current one for (let j = i - sentencesToCombine; j < i; j++) { if (j >= 0) { - combinedSentence += sentences[j].sentence + " " + combinedSentence += sentences[j].sentence + " "; } } // Add the current sentence - combinedSentence += sentences[i].sentence + combinedSentence += sentences[i].sentence; // Add sentences after the current one for (let j = i + 1; j < i + 1 + sentencesToCombine; j++) { if (j < sentences.length) { - combinedSentence += " " + sentences[j].sentence + combinedSentence += " " + sentences[j].sentence; } } // Store the combined sentence in the current sentence object - sentences[i].combined_sentence = combinedSentence + sentences[i].combined_sentence = combinedSentence; } - return sentences as SentenceWithCombinedSentence[] + return sentences as SentenceWithCombinedSentence[]; } -function calculateCosineDistances(sentences: SentenceWithEmbedding[]): [number[], SentenceWithEmbedding[]] { +function calculateCosineDistances( + sentences: SentenceWithEmbedding[] +): [number[], SentenceWithEmbedding[]] { /** * Calculate cosine distances between sentences. * @@ -56,26 +64,26 @@ function calculateCosineDistances(sentences: SentenceWithEmbedding[]): [number[] * @returns Tuple of distances and sentences. 
*/ - const distances: number[] = [] + const distances: number[] = []; for (let i = 0; i < sentences.length - 1; i++) { - const embeddingCurrent = sentences[i].combined_sentence_embedding - const embeddingNext = sentences[i + 1].combined_sentence_embedding + const embeddingCurrent = sentences[i].combined_sentence_embedding; + const embeddingNext = sentences[i + 1].combined_sentence_embedding; // Calculate cosine similarity - const similarity = cosineSimilarity(embeddingCurrent, embeddingNext) + const similarity = cosineSimilarity(embeddingCurrent, embeddingNext); // Convert to cosine distance - const distance = 1 - similarity + const distance = 1 - similarity; // Append cosine distance to the list - distances.push(distance) + distances.push(distance); // Store distance in the dictionary - sentences[i].distance_to_next = distance + sentences[i].distance_to_next = distance; } - return [distances, sentences] + return [distances, sentences]; } enum BreakpointThresholdType { @@ -90,161 +98,209 @@ const BREAKPOINT_DEFAULTS: Record = { standard_deviation: 3, interquartile: 1.5, gradient: 95, -} +}; interface SemanticChunkerOptions { - sentencesToCombine?: number - sentenceSplitRegex?: RegExp - addStartIndex?: boolean - breakpointThresholdType?: BreakpointThresholdType - breakpointThresholdAmount?: number - numberOfChunks?: number - minChunkSize?: number + sentencesToCombine?: number; + sentenceSplitRegex?: RegExp; + addStartIndex?: boolean; + breakpointThresholdType?: BreakpointThresholdType; + breakpointThresholdAmount?: number; + numberOfChunks?: number; + minChunkSize?: number; } export class SemanticChunker extends BaseDocumentTransformer { - private sentencesToCombine: number = 1 - private sentenceSplitRegex: RegExp = new RegExp(`(?<=[.?!])\\s+`) - private addStartIndex: boolean = false - private breakpointThresholdType: BreakpointThresholdType = BreakpointThresholdType.PERCENTILE - private breakpointThresholdAmount: number = 
BREAKPOINT_DEFAULTS[this.breakpointThresholdType] - private numberOfChunks?: number - private minChunkSize?: number + private sentencesToCombine: number = 1; + private sentenceSplitRegex: RegExp = new RegExp(`(?<=[.?!])\\s+`); + private addStartIndex: boolean = false; + private breakpointThresholdType: BreakpointThresholdType = + BreakpointThresholdType.PERCENTILE; + private breakpointThresholdAmount: number = + BREAKPOINT_DEFAULTS[this.breakpointThresholdType]; + private numberOfChunks?: number; + private minChunkSize?: number; constructor( private embeddings: Embeddings, options?: SemanticChunkerOptions ) { - super() + super(); if (options) { - if (options.sentencesToCombine !== undefined) this.sentencesToCombine = options.sentencesToCombine - if (options.sentenceSplitRegex !== undefined) this.sentenceSplitRegex = new RegExp(options.sentenceSplitRegex) - if (options.addStartIndex !== undefined) this.addStartIndex = options.addStartIndex + if (options.sentencesToCombine !== undefined) + this.sentencesToCombine = options.sentencesToCombine; + if (options.sentenceSplitRegex !== undefined) + this.sentenceSplitRegex = new RegExp(options.sentenceSplitRegex); + if (options.addStartIndex !== undefined) + this.addStartIndex = options.addStartIndex; if (options.breakpointThresholdType !== undefined) { - this.breakpointThresholdType = options.breakpointThresholdType - this.breakpointThresholdAmount = BREAKPOINT_DEFAULTS[options.breakpointThresholdType] + this.breakpointThresholdType = options.breakpointThresholdType; + this.breakpointThresholdAmount = + BREAKPOINT_DEFAULTS[options.breakpointThresholdType]; } - if (options.breakpointThresholdAmount !== undefined) this.breakpointThresholdAmount = options.breakpointThresholdAmount - if (options.numberOfChunks !== undefined) this.numberOfChunks = options.numberOfChunks - if (options.minChunkSize !== undefined) this.minChunkSize = options.minChunkSize + if (options.breakpointThresholdAmount !== undefined) + 
this.breakpointThresholdAmount = options.breakpointThresholdAmount; + if (options.numberOfChunks !== undefined) + this.numberOfChunks = options.numberOfChunks; + if (options.minChunkSize !== undefined) + this.minChunkSize = options.minChunkSize; } } // Utility functions private mean(arr: number[]): number { - return arr.reduce((a, b) => a + b, 0) / arr.length + return arr.reduce((a, b) => a + b, 0) / arr.length; } private std(arr: number[]): number { - const mean = this.mean(arr) - return Math.sqrt(arr.map((x) => (x - mean) ** 2).reduce((a, b) => a + b, 0) / arr.length) + const mean = this.mean(arr); + return Math.sqrt( + arr.map((x) => (x - mean) ** 2).reduce((a, b) => a + b, 0) / arr.length + ); } private percentile(arr: number[], p: number): number { - const sorted = [...arr].sort((a, b) => a - b) - const index = Math.ceil((p / 100) * sorted.length) - 1 - return sorted[Math.max(index, 0)] + const sorted = [...arr].sort((a, b) => a - b); + const index = Math.ceil((p / 100) * sorted.length) - 1; + return sorted[Math.max(index, 0)]; } private percentileValues(arr: number[], percentiles: number[]): number[] { - return percentiles.map((p) => this.percentile(arr, p)) + return percentiles.map((p) => this.percentile(arr, p)); } private gradient(arr: number[]): number[] { - return arr.map((_, i, a) => (i === 0 ? a[i + 1] - a[i] : a[i] - a[i - 1])).slice(1) + return arr + .map((_, i, a) => (i === 0 ? 
a[i + 1] - a[i] : a[i] - a[i - 1])) + .slice(1); } - private _calculateBreakpointThreshold(distances: number[]): [number, number[]] { + private _calculateBreakpointThreshold( + distances: number[] + ): [number, number[]] { switch (this.breakpointThresholdType) { case BreakpointThresholdType.PERCENTILE: - return [this.percentile(distances, this.breakpointThresholdAmount), distances] + return [ + this.percentile(distances, this.breakpointThresholdAmount), + distances, + ]; case BreakpointThresholdType.STANDARD_DEVIATION: - return [this.mean(distances) + this.breakpointThresholdAmount * this.std(distances), distances] + return [ + this.mean(distances) + + this.breakpointThresholdAmount * this.std(distances), + distances, + ]; case BreakpointThresholdType.INTERQUARTILE: - const [q1, q3] = this.percentileValues(distances, [25, 75]) - const iqr = q3 - q1 - return [this.mean(distances) + this.breakpointThresholdAmount * iqr, distances] + const [q1, q3] = this.percentileValues(distances, [25, 75]); + const iqr = q3 - q1; + return [ + this.mean(distances) + this.breakpointThresholdAmount * iqr, + distances, + ]; case BreakpointThresholdType.GRADIENT: - const distanceGradient = this.gradient(distances) - return [this.percentile(distanceGradient, this.breakpointThresholdAmount), distanceGradient] + const distanceGradient = this.gradient(distances); + return [ + this.percentile(distanceGradient, this.breakpointThresholdAmount), + distanceGradient, + ]; default: - throw new Error(`Unexpected breakpointThresholdType: ${this.breakpointThresholdType}`) + throw new Error( + `Unexpected breakpointThresholdType: ${this.breakpointThresholdType}` + ); } } private _thresholdFromClusters(distances: number[]): number { if (this.numberOfChunks === undefined) { - throw new Error("This should never be called if `numberOfChunks` is undefined.") + throw new Error( + "This should never be called if `numberOfChunks` is undefined." 
+ ); } - const x1 = distances.length - const y1 = 0.0 - const x2 = 1.0 - const y2 = 100.0 + const x1 = distances.length; + const y1 = 0.0; + const x2 = 1.0; + const y2 = 100.0; - const x = Math.max(Math.min(this.numberOfChunks, x1), x2) - const y = x2 === x1 ? y2 : y1 + ((y2 - y1) / (x2 - x1)) * (x - x1) + const x = Math.max(Math.min(this.numberOfChunks, x1), x2); + const y = x2 === x1 ? y2 : y1 + ((y2 - y1) / (x2 - x1)) * (x - x1); - return this.percentile(distances, Math.min(Math.max(y, 0), 100)) + return this.percentile(distances, Math.min(Math.max(y, 0), 100)); } - private async _calculateSentenceDistances(singleSentencesList: string[]): Promise<[number[], any[]]> { + private async _calculateSentenceDistances( + singleSentencesList: string[] + ): Promise<[number[], any[]]> { const sentences = combineSentences( singleSentencesList.map((sentence) => ({ sentence })), this.sentencesToCombine - ) + ); - const embeddings = await this.embeddings.embedDocuments(sentences.map((x) => x.combined_sentence)) + const embeddings = await this.embeddings.embedDocuments( + sentences.map((x) => x.combined_sentence) + ); sentences.forEach((sentence, i) => { - sentence.combined_sentence_embedding = embeddings[i] - }) + sentence.combined_sentence_embedding = embeddings[i]; + }); - return calculateCosineDistances(sentences as SentenceWithEmbedding[]) + return calculateCosineDistances(sentences as SentenceWithEmbedding[]); } private _getSingleSentencesList(text: string): string[] { - return text.split(this.sentenceSplitRegex) + return text.split(this.sentenceSplitRegex); } async splitText(text: string): Promise { - const singleSentencesList = this._getSingleSentencesList(text) + const singleSentencesList = this._getSingleSentencesList(text); - if (singleSentencesList.length <= 1) return singleSentencesList + if (singleSentencesList.length <= 1) return singleSentencesList; - if (this.breakpointThresholdType === BreakpointThresholdType.GRADIENT && singleSentencesList.length === 2) { - 
return singleSentencesList + if ( + this.breakpointThresholdType === BreakpointThresholdType.GRADIENT && + singleSentencesList.length === 2 + ) { + return singleSentencesList; } - const [distances, sentences] = await this._calculateSentenceDistances(singleSentencesList) - let breakpointDistanceThreshold: number - let breakpointArray: number[] + const [distances, sentences] = await this._calculateSentenceDistances( + singleSentencesList + ); + let breakpointDistanceThreshold: number; + let breakpointArray: number[]; if (this.numberOfChunks !== undefined) { - breakpointDistanceThreshold = this._thresholdFromClusters(distances) - breakpointArray = distances + breakpointDistanceThreshold = this._thresholdFromClusters(distances); + breakpointArray = distances; } else { - ;[breakpointDistanceThreshold, breakpointArray] = this._calculateBreakpointThreshold(distances) + [breakpointDistanceThreshold, breakpointArray] = + this._calculateBreakpointThreshold(distances); } - const indicesAboveThresh = breakpointArray.map((x, i) => (x > breakpointDistanceThreshold ? i : -1)).filter((i) => i !== -1) + const indicesAboveThresh = breakpointArray + .map((x, i) => (x > breakpointDistanceThreshold ? 
i : -1)) + .filter((i) => i !== -1); - const chunks: string[] = [] - let startIndex = 0 + const chunks: string[] = []; + let startIndex = 0; for (const index of indicesAboveThresh) { - const endIndex = index - const group = sentences.slice(startIndex, endIndex + 1) - const combinedText = group.map((d) => d.sentence).join(" ") + const endIndex = index; + const group = sentences.slice(startIndex, endIndex + 1); + const combinedText = group.map((d) => d.sentence).join(" "); - if (this.minChunkSize !== undefined && combinedText.length < this.minChunkSize) continue - chunks.push(combinedText) - startIndex = index + 1 + if ( + this.minChunkSize !== undefined && + combinedText.length < this.minChunkSize + ) + continue; + chunks.push(combinedText); + startIndex = index + 1; } if (startIndex < sentences.length) { @@ -253,46 +309,49 @@ export class SemanticChunker extends BaseDocumentTransformer { .slice(startIndex) .map((d) => d.sentence) .join(" ") - ) + ); } - return chunks + return chunks; } - async createDocuments(texts: string[], metadatas?: Record[]): Promise { - const _metadatas = metadatas || Array(texts.length).fill({}) - const documents: Document[] = [] + async createDocuments( + texts: string[], + metadatas?: Record[] + ): Promise { + const _metadatas = metadatas || Array(texts.length).fill({}); + const documents: Document[] = []; for (const [i, text] of texts.entries()) { - let startIndex = 0 - const chunks = await this.splitText(text) + let startIndex = 0; + const chunks = await this.splitText(text); chunks.forEach((chunk) => { - const metadata = { ..._metadatas[i] } + const metadata = { ..._metadatas[i] }; - if (this.addStartIndex) metadata.start_index = startIndex + if (this.addStartIndex) metadata.start_index = startIndex; - documents.push(new Document({ pageContent: chunk, metadata })) - startIndex += chunk.length - }) + documents.push(new Document({ pageContent: chunk, metadata })); + startIndex += chunk.length; + }); } - return documents + return 
documents;
   }
 
   async splitDocuments(documents: Document[]): Promise<Document[]> {
-    const texts: string[] = []
-    const metadatas: Record<string, any>[] = []
+    const texts: string[] = [];
+    const metadatas: Record<string, any>[] = [];
 
     for (const doc of documents) {
-      texts.push(doc.pageContent)
-      metadatas.push(doc.metadata)
+      texts.push(doc.pageContent);
+      metadatas.push(doc.metadata);
     }
 
-    return this.createDocuments(texts, metadatas)
+    return this.createDocuments(texts, metadatas);
   }
 
   async transformDocuments(documents: Document[]): Promise<Document[]> {
-    return this.splitDocuments(documents)
+    return this.splitDocuments(documents);
   }
 }

From 36cb00f896dbd284f527d9fb52e1ad92a305f8a1 Mon Sep 17 00:00:00 2001
From: Daniel Arroyo
Date: Tue, 22 Apr 2025 20:51:02 +0200
Subject: [PATCH 6/6] test: add unit tests for semantic chunker

---
 .../text_splitters/semantic_chunker.ts        |  14 +-
 .../tests/semantic_chunker.test.ts            | 126 ++++++++++++++++++
 2 files changed, 137 insertions(+), 3 deletions(-)
 create mode 100644 libs/langchain-community/src/experimental/text_splitters/tests/semantic_chunker.test.ts

diff --git a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts
index 46322eb5d5a9..7271faaeaabf 100644
--- a/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts
+++ b/libs/langchain-community/src/experimental/text_splitters/semantic_chunker.ts
@@ -1,6 +1,5 @@
 import { BaseDocumentTransformer, Document } from "@langchain/core/documents";
 import { Embeddings } from "@langchain/core/embeddings";
-import { cosineSimilarity } from "@langchain/core/utils/math";
 
 type Sentence = {
   sentence: string;
@@ -15,7 +14,16 @@ type SentenceWithEmbedding = Sentence & {
   distance_to_next: number;
 };
 
-function combineSentences(
+// Utility function for cosine similarity
+function cosineSimilarity(vecA: number[], vecB: number[]) {
+  const dotProduct = vecA.reduce((sum, a, i) => sum + a * vecB[i], 0)
+  const magnitudeA = 
Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0)) + const magnitudeB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0)) + + return magnitudeA && magnitudeB ? dotProduct / (magnitudeA * magnitudeB) : 0 +} + +export function combineSentences( sentences: Sentence[], sentencesToCombine: number = 1 ) { @@ -54,7 +62,7 @@ function combineSentences( return sentences as SentenceWithCombinedSentence[]; } -function calculateCosineDistances( +export function calculateCosineDistances( sentences: SentenceWithEmbedding[] ): [number[], SentenceWithEmbedding[]] { /** diff --git a/libs/langchain-community/src/experimental/text_splitters/tests/semantic_chunker.test.ts b/libs/langchain-community/src/experimental/text_splitters/tests/semantic_chunker.test.ts new file mode 100644 index 000000000000..0d6c0100d839 --- /dev/null +++ b/libs/langchain-community/src/experimental/text_splitters/tests/semantic_chunker.test.ts @@ -0,0 +1,126 @@ +import { + combineSentences, + calculateCosineDistances, +} from "../semantic_chunker.js"; + +describe("combineSentences", () => { + test("combines sentences with default sentencesToCombine (1)", () => { + const sentences = [ + { sentence: "This is sentence one." }, + { sentence: "This is sentence two." }, + { sentence: "This is sentence three." }, + ]; + + const result = combineSentences(sentences); + + expect(result).toEqual([ + { + sentence: "This is sentence one.", + combined_sentence: "This is sentence one. This is sentence two.", + }, + { + sentence: "This is sentence two.", + combined_sentence: + "This is sentence one. This is sentence two. This is sentence three.", + }, + { + sentence: "This is sentence three.", + combined_sentence: "This is sentence two. 
This is sentence three.", + }, + ]); + }); + + test("handles an empty array of sentences", () => { + const sentences: any[] = []; + + const result = combineSentences(sentences); + + expect(result).toEqual([]); + }); + + test("handles a single sentence", () => { + const sentences = [{ sentence: "Only one sentence here." }]; + + const result = combineSentences(sentences); + + expect(result).toEqual([ + { + sentence: "Only one sentence here.", + combined_sentence: "Only one sentence here.", + }, + ]); + }); +}); + +describe("calculateCosineDistances", () => { + test("calculates cosine distances between sentence embeddings", () => { + const sentences = [ + { + sentence: "Sentence one.", + combined_sentence_embedding: [1, 0, 0], + distance_to_next: 0, + }, + { + sentence: "Sentence two.", + combined_sentence_embedding: [0, 1, 0], + distance_to_next: 0, + }, + { + sentence: "Sentence three.", + combined_sentence_embedding: [0, 0, 1], + distance_to_next: 0, + }, + ]; + + const [distances, updatedSentences] = calculateCosineDistances(sentences); + + expect(distances).toEqual([1, 1]); + expect(updatedSentences).toEqual([ + { + sentence: "Sentence one.", + combined_sentence_embedding: [1, 0, 0], + distance_to_next: 1, + }, + { + sentence: "Sentence two.", + combined_sentence_embedding: [0, 1, 0], + distance_to_next: 1, + }, + { + sentence: "Sentence three.", + combined_sentence_embedding: [0, 0, 1], + distance_to_next: 0, + }, + ]); + }); + + test("handles a single sentence with no distances to calculate", () => { + const sentences = [ + { + sentence: "Only one sentence.", + combined_sentence_embedding: [1, 0, 0], + distance_to_next: 0, + }, + ]; + + const [distances, updatedSentences] = calculateCosineDistances(sentences); + + expect(distances).toEqual([]); + expect(updatedSentences).toEqual([ + { + sentence: "Only one sentence.", + combined_sentence_embedding: [1, 0, 0], + distance_to_next: 0, + }, + ]); + }); + + test("handles an empty array of sentences", () => { + const 
sentences: any[] = []; + + const [distances, updatedSentences] = calculateCosineDistances(sentences); + + expect(distances).toEqual([]); + expect(updatedSentences).toEqual([]); + }); +});