from abc import ABC, abstractmethod
from collections import Counter
import re
import string

from nltk.tokenize import sent_tokenize

from omniparse.web.model_loader import load_nltk_punkt


# Define the abstract base class for chunking strategies
class ChunkingStrategy(ABC):
    @abstractmethod
    def chunk(self, text: str) -> list:
        """
        Abstract method to chunk the given text.
        """
        pass
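
# A minimal sketch (not part of the original module): a concrete strategy only
# needs to subclass ChunkingStrategy and implement chunk(). The LineChunking
# name below is hypothetical.
#
#     class LineChunking(ChunkingStrategy):
#         def chunk(self, text: str) -> list:
#             return [line for line in text.splitlines() if line.strip()]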


# Regex-based chunking
class RegexChunking(ChunkingStrategy):
    def __init__(self, patterns=None, **kwargs):
        if patterns is None:
            patterns = [r"\n\n"]  # Default split pattern
        self.patterns = patterns

    def chunk(self, text: str) -> list:
        # Split every current chunk on each pattern in turn
        paragraphs = [text]
        for pattern in self.patterns:
            new_paragraphs = []
            for paragraph in paragraphs:
                new_paragraphs.extend(re.split(pattern, paragraph))
            paragraphs = new_paragraphs
        return paragraphs
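
# Usage sketch (illustrative, not from the original file): the default pattern
# splits on blank lines, i.e. paragraph boundaries.
#
#     RegexChunking().chunk("first para\n\nsecond para")
#     # -> ["first para", "second para"]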


# NLP-based sentence chunking
class NlpSentenceChunking(ChunkingStrategy):
    def __init__(self, **kwargs):
        # Ensure the NLTK punkt sentence model is available
        load_nltk_punkt()

    def chunk(self, text: str) -> list:
        sentences = sent_tokenize(text)
        sens = [sent.strip() for sent in sentences]
        # Deduplicate; note that round-tripping through a set drops sentence order
        return list(set(sens))
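
# Usage sketch (illustrative): requires the NLTK punkt data, which
# load_nltk_punkt() is assumed to fetch.
#
#     NlpSentenceChunking().chunk("First sentence. Second sentence.")
#     # -> ["First sentence.", "Second sentence."] (order not guaranteed)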


# Topic-based segmentation using TextTiling
class TopicSegmentationChunking(ChunkingStrategy):
    def __init__(self, num_keywords=3, **kwargs):
        import nltk as nl

        self.tokenizer = nl.tokenize.TextTilingTokenizer()
        self.num_keywords = num_keywords

    def chunk(self, text: str) -> list:
        # Use the TextTiling tokenizer to split the text into topical segments
        return self.tokenizer.tokenize(text)

    def extract_keywords(self, text: str) -> list:
        # Tokenize and remove stopwords and punctuation
        import nltk as nl

        tokens = nl.tokenize.word_tokenize(text)
        tokens = [
            token.lower()
            for token in tokens
            if token not in nl.corpus.stopwords.words("english")
            and token not in string.punctuation
        ]

        # Calculate frequency distribution
        freq_dist = Counter(tokens)
        # Keep the most frequent tokens as keywords
        return [word for word, _ in freq_dist.most_common(self.num_keywords)]

    def chunk_with_topics(self, text: str) -> list:
        # Segment the text into topics
        segments = self.chunk(text)
        # Extract keywords for each topic segment
        segments_with_topics = [
            (segment, self.extract_keywords(segment)) for segment in segments
        ]
        return segments_with_topics
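
# Usage sketch (illustrative): TextTiling needs the NLTK stopwords corpus and
# a reasonably long, paragraph-structured input; on very short texts
# TextTilingTokenizer raises a ValueError.
#
#     TopicSegmentationChunking(num_keywords=3).chunk_with_topics(long_document)
#     # -> [(segment_text, ["keyword1", "keyword2", "keyword3"]), ...]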


# Fixed-length word chunks
class FixedLengthWordChunking(ChunkingStrategy):
    def __init__(self, chunk_size=100, **kwargs):
        self.chunk_size = chunk_size

    def chunk(self, text: str) -> list:
        # Slice the word list into consecutive, non-overlapping chunks
        words = text.split()
        return [
            " ".join(words[i : i + self.chunk_size])
            for i in range(0, len(words), self.chunk_size)
        ]
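
# Usage sketch (illustrative): seven words with chunk_size=3 yield chunks of
# 3, 3, and 1 words.
#
#     FixedLengthWordChunking(chunk_size=3).chunk("a b c d e f g")
#     # -> ["a b c", "d e f", "g"]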


# Sliding window chunking
class SlidingWindowChunking(ChunkingStrategy):
    def __init__(self, window_size=100, step=50, **kwargs):
        self.window_size = window_size
        self.step = step

    def chunk(self, text: str) -> list:
        # Slide a window_size-word window over the text, advancing step words
        # at a time, so consecutive chunks overlap by window_size - step words
        words = text.split()
        chunks = []
        for i in range(0, len(words), self.step):
            chunks.append(" ".join(words[i : i + self.window_size]))
        return chunks
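

# Minimal smoke test (illustrative, not part of the original module); the
# sample text and parameter values are arbitrary.
if __name__ == "__main__":
    sample = (
        "Chunking splits long documents into smaller pieces. "
        "Each strategy trades context for granularity differently."
    )
    print(RegexChunking().chunk(sample))
    print(FixedLengthWordChunking(chunk_size=6).chunk(sample))
    print(SlidingWindowChunking(window_size=6, step=3).chunk(sample))
    # NlpSentenceChunking and TopicSegmentationChunking additionally require
    # the NLTK punkt and stopwords data to be installed.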