Commit c951b6e

Changes according to CR
1 parent 25a7f76 commit c951b6e

File tree

1 file changed

fplus-lib/src/external_services/similarity_detection.rs

Lines changed: 31 additions & 38 deletions
```diff
@@ -80,19 +80,10 @@ pub async fn detect_similar_applications(
     let similar_data_set_sample = get_similar_texts_levenshtein(&data_set_samples)?;
 
     let unique_addresses: HashSet<String> = similar_project_desciptions
-        .unwrap_or_default()
         .into_iter()
-        .chain(
-            similar_stored_data_desciptions
-                .unwrap_or_default()
-                .into_iter(),
-        )
-        .chain(
-            similar_project_and_stored_data_desciptions
-                .unwrap_or_default()
-                .into_iter(),
-        )
-        .chain(similar_data_set_sample.unwrap_or_default().into_iter())
+        .chain(similar_stored_data_desciptions.into_iter())
+        .chain(similar_project_and_stored_data_desciptions.into_iter())
+        .chain(similar_data_set_sample.into_iter())
         .collect();
     let unique_addresses: Vec<String> = unique_addresses.into_iter().collect();
```
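
The core of this change: after the CR, the similarity helpers return a plain `Vec<String>` (empty when nothing matches) instead of `Option<Vec<String>>`, so the caller no longer needs `unwrap_or_default()` and the nested `chain` blocks collapse to one-liners. A minimal sketch of the resulting pattern, with hypothetical address values standing in for the four detector results:

```rust
use std::collections::HashSet;

fn main() {
    // Hypothetical detector outputs; each Vec<String> plays the role of one
    // of the four `similar_*` result lists in the patched function.
    let project: Vec<String> = vec!["f1abc".into(), "f1def".into()];
    let stored_data: Vec<String> = vec!["f1def".into()];
    let project_and_stored: Vec<String> = vec![]; // empty chains through as nothing
    let data_set_sample: Vec<String> = vec!["f1ghi".into()];

    // Same shape as the patched code: chain the Vecs and let the HashSet
    // deduplicate client addresses that several detectors reported.
    let unique_addresses: HashSet<String> = project
        .into_iter()
        .chain(stored_data.into_iter())
        .chain(project_and_stored.into_iter())
        .chain(data_set_sample.into_iter())
        .collect();
    let unique_addresses: Vec<String> = unique_addresses.into_iter().collect();

    assert_eq!(unique_addresses.len(), 3); // "f1def" is counted once
}
```

An empty `Vec` contributes nothing to the chain, which is exactly what `unwrap_or_default()` produced for `None`, so the deduplicated result is unchanged while the happy path gets simpler.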

```diff
@@ -102,19 +93,20 @@ pub async fn detect_similar_applications(
     Ok(())
 }
 
-fn get_similar_texts_tfidf(documents: &Vec<Document>) -> Result<Option<Vec<String>>, LDNError> {
-    let mut tokenized_documents = Vec::new();
-    for doc in documents {
-        tokenized_documents.push(tfidf_summarizer::tokenize(&doc.text));
-    }
+fn get_similar_texts_tfidf(documents: &[Document]) -> Result<Vec<String>, LDNError> {
+    let tokenized_documents: Vec<Vec<String>> = documents
+        .iter()
+        .map(|doc| tfidf_summarizer::tokenize(&doc.text))
+        .collect();
+
     let df = tfidf_summarizer::document_frequency(&tokenized_documents);
     let documents_words: Vec<String> = df.keys().cloned().collect();
     let idf = tfidf_summarizer::inverse_document_frequency(&df, tokenized_documents.len());
-    let mut tfidf_result = Vec::new();
+    let tfidf_result: Vec<HashMap<String, f64>> = tokenized_documents
+        .iter()
+        .map(|tokens| tfidf_summarizer::tf_idf(tokens.clone(), &idf))
+        .collect();
 
-    for tokens in tokenized_documents.iter() {
-        tfidf_result.push(tfidf_summarizer::tf_idf(tokens.clone(), &idf));
-    }
     let documents_converted_to_array = convert_to_ndarray(&tfidf_result, &documents_words);
     let mut similar_applications: Vec<String> = Vec::new();
     let tfidf_threshold = get_env_var_or_default("TFIDF_THRESHOLD")
```
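
For background on what those `tfidf_summarizer` calls compute: TF-IDF weighs a token by its frequency within one document, discounted by how many documents contain it. The refactor above only changes how the per-document maps are built (iterator `map`/`collect` instead of `for` loops pushing into mutable `Vec`s). A self-contained toy sketch of the computation itself, independent of the crate's actual API:

```rust
use std::collections::{HashMap, HashSet};

/// Toy TF-IDF over pre-tokenized documents: tf(t, d) * ln(N / df(t)).
/// Illustrative only; the real code delegates this to `tfidf_summarizer`.
fn tf_idf_sketch(docs: &[Vec<String>]) -> Vec<HashMap<String, f64>> {
    let n = docs.len() as f64;

    // Document frequency: the number of documents containing each token.
    let mut df: HashMap<String, f64> = HashMap::new();
    for doc in docs {
        for tok in doc.iter().collect::<HashSet<_>>() {
            *df.entry(tok.clone()).or_insert(0.0) += 1.0;
        }
    }

    docs.iter()
        .map(|doc| {
            // Term frequency within this document.
            let mut tf: HashMap<String, f64> = HashMap::new();
            for tok in doc {
                *tf.entry(tok.clone()).or_insert(0.0) += 1.0;
            }
            let len = doc.len() as f64;
            tf.into_iter()
                .map(|(tok, count)| {
                    let idf = (n / df[&tok]).ln();
                    (tok, (count / len) * idf)
                })
                .collect()
        })
        .collect()
}

fn main() {
    let docs = vec![
        vec!["store".to_string(), "public".to_string(), "datasets".to_string()],
        vec!["store".to_string(), "private".to_string(), "videos".to_string()],
    ];
    // "store" appears in every document, so its weight is ln(2/2) = 0;
    // the distinguishing tokens get weight ln(2/1) / 3.
    for weights in tf_idf_sketch(&docs) {
        println!("{weights:?}");
    }
}
```
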
```diff
@@ -129,28 +121,29 @@ fn get_similar_texts_tfidf(documents: &Vec<Document>) -> Result<Option<Vec<Strin
             similar_applications.push(documents[i].client_address.clone());
         }
     }
-    if similar_applications.is_empty() {
-        return Ok(None);
-    }
-    Ok(Some(similar_applications))
+
+    Ok(similar_applications)
 }
 
-fn get_similar_texts_levenshtein(documents: &[Document]) -> Result<Option<Vec<String>>, LDNError> {
+fn get_similar_texts_levenshtein(documents: &[Document]) -> Result<Vec<String>, LDNError> {
     let mut similar_texts = Vec::new();
     let levenshtein_threshold = get_env_var_or_default("LEVENSHTEIN_THRESHOLD")
         .parse::<usize>()
-        .map_err(|e| LDNError::New(format!("Parse tfidf threshold score to f64 failed: {}", e)))?;
-    for i in 1..documents.len() {
-        let similarity = levenshtein(&documents[0].text, &documents[i].text);
-
-        if similarity < levenshtein_threshold {
-            similar_texts.push(documents[i].client_address.clone());
-        }
-    }
-    if similar_texts.is_empty() {
-        return Ok(None);
-    }
-    Ok(Some(similar_texts))
+        .map_err(|e| {
+            LDNError::New(format!(
+                "Parse tfidf threshold score to usize failed: {}",
+                e
+            ))
+        })?;
+    similar_texts.extend(
+        documents
+            .iter()
+            .skip(1)
+            .filter(|doc| levenshtein(&documents[0].text, &doc.text) < levenshtein_threshold)
+            .map(|doc| doc.client_address.clone()),
+    );
+
+    Ok(similar_texts)
 }
 
 fn convert_to_ndarray(
```
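
The Levenshtein helper gets the same treatment: the index loop becomes an iterator pipeline that skips the first document (the application being screened), keeps every later document whose edit distance to it is under the threshold, and collects the matching client addresses; the parse error message is also corrected to say `usize` instead of `f64`. A runnable sketch of the pipeline, assuming the `levenshtein` crate and a simplified stand-in for the crate-local `Document` type:

```rust
use levenshtein::levenshtein;

// Simplified stand-in for the crate's Document type; the field names
// match the ones used in the diff (`client_address`, `text`).
struct Document {
    client_address: String,
    text: String,
}

fn main() {
    let documents = vec![
        Document { client_address: "f1aaa".into(), text: "store public datasets".into() },
        Document { client_address: "f1bbb".into(), text: "store public data sets".into() },
        Document { client_address: "f1ccc".into(), text: "archive private videos".into() },
    ];
    // In the real code this comes from the LEVENSHTEIN_THRESHOLD env var.
    let levenshtein_threshold: usize = 5;

    // Same pipeline as the patch: documents[0] is the application under
    // review; every later document closer than the threshold is flagged.
    let similar_texts: Vec<String> = documents
        .iter()
        .skip(1)
        .filter(|doc| levenshtein(&documents[0].text, &doc.text) < levenshtein_threshold)
        .map(|doc| doc.client_address.clone())
        .collect();

    assert_eq!(similar_texts, vec!["f1bbb".to_string()]);
}
```

One nuance the refactor preserves from the original loop: only `documents[0]` is compared against the rest, so pairwise similarity among the other documents is not checked.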
