@@ -80,19 +80,10 @@ pub async fn detect_similar_applications(
80
80
let similar_data_set_sample = get_similar_texts_levenshtein ( & data_set_samples) ?;
81
81
82
82
let unique_addresses: HashSet < String > = similar_project_desciptions
83
- . unwrap_or_default ( )
84
83
. into_iter ( )
85
- . chain (
86
- similar_stored_data_desciptions
87
- . unwrap_or_default ( )
88
- . into_iter ( ) ,
89
- )
90
- . chain (
91
- similar_project_and_stored_data_desciptions
92
- . unwrap_or_default ( )
93
- . into_iter ( ) ,
94
- )
95
- . chain ( similar_data_set_sample. unwrap_or_default ( ) . into_iter ( ) )
84
+ . chain ( similar_stored_data_desciptions. into_iter ( ) )
85
+ . chain ( similar_project_and_stored_data_desciptions. into_iter ( ) )
86
+ . chain ( similar_data_set_sample. into_iter ( ) )
96
87
. collect ( ) ;
97
88
let unique_addresses: Vec < String > = unique_addresses. into_iter ( ) . collect ( ) ;
98
89
@@ -102,19 +93,20 @@ pub async fn detect_similar_applications(
102
93
Ok ( ( ) )
103
94
}
104
95
105
- fn get_similar_texts_tfidf ( documents : & Vec < Document > ) -> Result < Option < Vec < String > > , LDNError > {
106
- let mut tokenized_documents = Vec :: new ( ) ;
107
- for doc in documents {
108
- tokenized_documents. push ( tfidf_summarizer:: tokenize ( & doc. text ) ) ;
109
- }
96
+ fn get_similar_texts_tfidf ( documents : & [ Document ] ) -> Result < Vec < String > , LDNError > {
97
+ let tokenized_documents: Vec < Vec < String > > = documents
98
+ . iter ( )
99
+ . map ( |doc| tfidf_summarizer:: tokenize ( & doc. text ) )
100
+ . collect ( ) ;
101
+
110
102
let df = tfidf_summarizer:: document_frequency ( & tokenized_documents) ;
111
103
let documents_words: Vec < String > = df. keys ( ) . cloned ( ) . collect ( ) ;
112
104
let idf = tfidf_summarizer:: inverse_document_frequency ( & df, tokenized_documents. len ( ) ) ;
113
- let mut tfidf_result = Vec :: new ( ) ;
105
+ let tfidf_result: Vec < HashMap < String , f64 > > = tokenized_documents
106
+ . iter ( )
107
+ . map ( |tokens| tfidf_summarizer:: tf_idf ( tokens. clone ( ) , & idf) )
108
+ . collect ( ) ;
114
109
115
- for tokens in tokenized_documents. iter ( ) {
116
- tfidf_result. push ( tfidf_summarizer:: tf_idf ( tokens. clone ( ) , & idf) ) ;
117
- }
118
110
let documents_converted_to_array = convert_to_ndarray ( & tfidf_result, & documents_words) ;
119
111
let mut similar_applications: Vec < String > = Vec :: new ( ) ;
120
112
let tfidf_threshold = get_env_var_or_default ( "TFIDF_THRESHOLD" )
@@ -129,28 +121,29 @@ fn get_similar_texts_tfidf(documents: &Vec<Document>) -> Result<Option<Vec<Strin
129
121
similar_applications. push ( documents[ i] . client_address . clone ( ) ) ;
130
122
}
131
123
}
132
- if similar_applications. is_empty ( ) {
133
- return Ok ( None ) ;
134
- }
135
- Ok ( Some ( similar_applications) )
124
+
125
+ Ok ( similar_applications)
136
126
}
137
127
138
- fn get_similar_texts_levenshtein ( documents : & [ Document ] ) -> Result < Option < Vec < String > > , LDNError > {
128
+ fn get_similar_texts_levenshtein ( documents : & [ Document ] ) -> Result < Vec < String > , LDNError > {
139
129
let mut similar_texts = Vec :: new ( ) ;
140
130
let levenshtein_threshold = get_env_var_or_default ( "LEVENSHTEIN_THRESHOLD" )
141
131
. parse :: < usize > ( )
142
- . map_err ( |e| LDNError :: New ( format ! ( "Parse tfidf threshold score to f64 failed: {}" , e) ) ) ?;
143
- for i in 1 ..documents. len ( ) {
144
- let similarity = levenshtein ( & documents[ 0 ] . text , & documents[ i] . text ) ;
145
-
146
- if similarity < levenshtein_threshold {
147
- similar_texts. push ( documents[ i] . client_address . clone ( ) ) ;
148
- }
149
- }
150
- if similar_texts. is_empty ( ) {
151
- return Ok ( None ) ;
152
- }
153
- Ok ( Some ( similar_texts) )
132
+ . map_err ( |e| {
133
+ LDNError :: New ( format ! (
134
+ "Parse tfidf threshold score to usize failed: {}" ,
135
+ e
136
+ ) )
137
+ } ) ?;
138
+ similar_texts. extend (
139
+ documents
140
+ . iter ( )
141
+ . skip ( 1 )
142
+ . filter ( |doc| levenshtein ( & documents[ 0 ] . text , & doc. text ) < levenshtein_threshold)
143
+ . map ( |doc| doc. client_address . clone ( ) ) ,
144
+ ) ;
145
+
146
+ Ok ( similar_texts)
154
147
}
155
148
156
149
fn convert_to_ndarray (
0 commit comments