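Changes according to CR (code review): similarity helpers return Vec<String> instead of Option<Vec<String>>, and manual loops are replaced with iterator chains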

Filip-L · Filip-L · commit c951b6ea38c5 · 2025-02-20T13:12:21.000+01:00
diff --git a/fplus-lib/src/external_services/similarity_detection.rs b/fplus-lib/src/external_services/similarity_detection.rs
@@ -80,19 +80,10 @@ pub async fn detect_similar_applications(
     let similar_data_set_sample = get_similar_texts_levenshtein(&data_set_samples)?;
 
     let unique_addresses: HashSet<String> = similar_project_desciptions
-        .unwrap_or_default()
         .into_iter()
-        .chain(
-            similar_stored_data_desciptions
-                .unwrap_or_default()
-                .into_iter(),
-        )
-        .chain(
-            similar_project_and_stored_data_desciptions
-                .unwrap_or_default()
-                .into_iter(),
-        )
-        .chain(similar_data_set_sample.unwrap_or_default().into_iter())
+        .chain(similar_stored_data_desciptions.into_iter())
+        .chain(similar_project_and_stored_data_desciptions.into_iter())
+        .chain(similar_data_set_sample.into_iter())
         .collect();
     let unique_addresses: Vec<String> = unique_addresses.into_iter().collect();
 
@@ -102,19 +93,20 @@ pub async fn detect_similar_applications(
     Ok(())
 }
 
-fn get_similar_texts_tfidf(documents: &Vec<Document>) -> Result<Option<Vec<String>>, LDNError> {
-    let mut tokenized_documents = Vec::new();
-    for doc in documents {
-        tokenized_documents.push(tfidf_summarizer::tokenize(&doc.text));
-    }
+fn get_similar_texts_tfidf(documents: &[Document]) -> Result<Vec<String>, LDNError> {
+    let tokenized_documents: Vec<Vec<String>> = documents
+        .iter()
+        .map(|doc| tfidf_summarizer::tokenize(&doc.text))
+        .collect();
+
     let df = tfidf_summarizer::document_frequency(&tokenized_documents);
     let documents_words: Vec<String> = df.keys().cloned().collect();
     let idf = tfidf_summarizer::inverse_document_frequency(&df, tokenized_documents.len());
-    let mut tfidf_result = Vec::new();
+    let tfidf_result: Vec<HashMap<String, f64>> = tokenized_documents
+        .iter()
+        .map(|tokens| tfidf_summarizer::tf_idf(tokens.clone(), &idf))
+        .collect();
 
-    for tokens in tokenized_documents.iter() {
-        tfidf_result.push(tfidf_summarizer::tf_idf(tokens.clone(), &idf));
-    }
     let documents_converted_to_array = convert_to_ndarray(&tfidf_result, &documents_words);
     let mut similar_applications: Vec<String> = Vec::new();
     let tfidf_threshold = get_env_var_or_default("TFIDF_THRESHOLD")
@@ -129,28 +121,29 @@ fn get_similar_texts_tfidf(documents: &Vec<Document>) -> Result<Option<Vec<Strin
             similar_applications.push(documents[i].client_address.clone());
         }
     }
-    if similar_applications.is_empty() {
-        return Ok(None);
-    }
-    Ok(Some(similar_applications))
+
+    Ok(similar_applications)
 }
 
-fn get_similar_texts_levenshtein(documents: &[Document]) -> Result<Option<Vec<String>>, LDNError> {
+fn get_similar_texts_levenshtein(documents: &[Document]) -> Result<Vec<String>, LDNError> {
     let mut similar_texts = Vec::new();
     let levenshtein_threshold = get_env_var_or_default("LEVENSHTEIN_THRESHOLD")
         .parse::<usize>()
-        .map_err(|e| LDNError::New(format!("Parse tfidf threshold score to f64 failed: {}", e)))?;
-    for i in 1..documents.len() {
-        let similarity = levenshtein(&documents[0].text, &documents[i].text);
-
-        if similarity < levenshtein_threshold {
-            similar_texts.push(documents[i].client_address.clone());
-        }
-    }
-    if similar_texts.is_empty() {
-        return Ok(None);
-    }
-    Ok(Some(similar_texts))
+        .map_err(|e| {
+            LDNError::New(format!(
+                "Parse tfidf threshold score to usize failed: {}",
+                e
+            ))
+        })?;
+    similar_texts.extend(
+        documents
+            .iter()
+            .skip(1)
+            .filter(|doc| levenshtein(&documents[0].text, &doc.text) < levenshtein_threshold)
+            .map(|doc| doc.client_address.clone()),
+    );
+
+    Ok(similar_texts)
 }
 
 fn convert_to_ndarray(