Skip to content

Revised "Revert "Generalisation replace_groups to wider class of cases"" #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 124 additions & 48 deletions extractor.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,40 @@
from number import NUMBER
from natasha.extractors import Extractor
from yargy.parser import Match
import math


class NumberExtractor(Extractor):
def __init__(self):
    """Initialise the extractor by passing the NUMBER rule to the base class."""
    super().__init__(NUMBER)


def n_digits(self, n):
    """
    Count the characters in the decimal notation of an integer

    A negative number counts its leading '-' as one extra character,
    so -45 yields 3. Zero yields 1.

    Args:
        n: number (truncated towards zero if it is not already an int)

    Result:
        digits: length of the decimal notation, including '-' if present
    """
    # Exact integer/string arithmetic: int(math.log10(n)) + 1 is off by one
    # whenever the float logarithm rounds up to a whole number
    # (e.g. 999999999999999 was reported as 16 digits).
    digits = len(str(abs(int(n))))
    if n < 0:
        digits += 1  # account for the '-' sign
    return digits


def trailing_zeros(self, n: int):
    """
    Count trailing zeros of a number

    Zero itself is reported as having no trailing zeros.

    Args:
        n: number

    Result:
        cnt: count of zeros
    """
    cnt = 0
    while n != 0 and n % 10 == 0:
        cnt += 1
        # Floor division keeps n an exact int; true division (n / 10) turned
        # it into a float, and large values were rounded to a nearby power
        # of ten, inflating the count.
        n //= 10
    return cnt

def replace(self, text):
"""
Замена чисел в тексте без их группировки
Expand Down Expand Up @@ -35,7 +64,28 @@ def replace(self, text):
return new_text
else:
return None


def _get_groups(self, text):
    """
    Split the parser matches found in the text into groups of adjacent matches

    Consecutive matches stay in one group while only whitespace (or nothing)
    separates them; any other text between two matches, or reaching the
    last match, closes the current group.

    Args:
        text: source text

    Result:
        groups: list of (facts, start, stop) tuples, where facts are the
            `fact` attributes of the grouped matches and start/stop are
            the span bounds of the group in the text
    """
    found = list(self.parser.findall(text))
    last_index = len(found) - 1
    result = []
    facts = []
    group_start = 0

    for index, current in enumerate(found):
        if index == 0:
            group_start = current.span.start
        # The last match is paired with itself, which forces the group to close
        following = current if index == last_index else found[index + 1]
        facts.append(current.fact)
        gap = text[current.span.stop: following.span.start]
        if gap.strip() or following == current:
            result.append((facts, group_start, current.span.stop))
            facts = []
            group_start = following.span.start
    return result


def replace_groups(self, text):
"""
Замена сгруппированных составных чисел в тексте
Expand All @@ -46,52 +96,78 @@ def replace_groups(self, text):
Результат:
new_text: текст с замененными числами
"""
if text:
start = 0
matches = list(self.parser.findall(text))
groups = []
group_matches = []

for i, match in enumerate(matches):
if i == 0:
start = match.span.start
if i == len(matches) - 1:
next_match = match

groups = self._get_groups(text)

new_text = ""
start = 0
for group in groups:
num = 0
nums = []
new_text += text[start: group[1]]
for match in group[0]:
curr_num = match.int * match.multiplier if match.multiplier else match.int
if match.multiplier:
num = (num + match.int) * match.multiplier
nums.append(num)
num = 0
elif num > curr_num or num == 0:
num += curr_num
else:
next_match = matches[i + 1]
group_matches.append(match.fact)
if text[match.span.stop: next_match.span.start].strip() or next_match == match:
groups.append((group_matches, start, match.span.stop))
group_matches = []
start = next_match.span.start

new_text = ""
start = 0

for group in groups:
num = 0
nums = []
new_text += text[start: group[1]]
for match in group[0]:
curr_num = match.int * match.multiplier if match.multiplier else match.int
if match.multiplier:
num = (num + match.int) * match.multiplier
nums.append(num)
num = 0
elif num > curr_num or num == 0:
num += curr_num
else:
nums.append(num)
num = 0
if num > 0:
nums.append(num)
new_text += str(sum(nums))
start = group[2]
new_text += text[start:]
num = 0
if num > 0:
nums.append(num)
new_text += str(sum(nums))
start = group[2]
new_text += text[start:]

if start == 0:
return text
else:
return new_text
else:
return None
return new_text


def replace_groups_sa(self, text):
    """
    Replace grouped compound numbers in the text, and standalone numbers, without summing them

    Args:
        text: source text

    Result:
        new_text: text with the numbers replaced
    """
    pieces = []
    cursor = 0
    for facts, group_start, group_stop in self._get_groups(text):
        # Keep the untouched text between the previous group and this one
        pieces.append(text[cursor: group_start])

        # First pass: build (value, multiplier) pairs, newest pair at index 0
        pairs = []
        last_zeros = 0
        last_mult = None
        for fact in facts:
            mult = fact.multiplier or 1
            value = fact.int
            zeros = self.trailing_zeros(value)
            # A value is added to the newest pair when it has fewer trailing
            # zeros than the previous value, a multiplier that is not
            # smaller, is non-zero and is shorter than that pair's value
            absorb = (
                zeros < last_zeros
                and mult >= last_mult
                and value != 0
                and self.n_digits(value) < self.n_digits(pairs[0][0])
            )
            if absorb:
                pairs[0] = (pairs[0][0] + value, mult)
            else:
                pairs.insert(0, (value, mult))
            last_mult = mult
            last_zeros = zeros

        # Second pass: a pair with a larger multiplier than the pair before
        # it is added to the running total instead of starting a new number
        last_mult = None
        totals = []
        for value, mult in pairs:
            if last_mult and mult > last_mult:
                totals[-1] += value * mult
            else:
                totals.append(value * mult)
            last_mult = mult

        # Pairs were walked newest-first, so flip back to text order
        pieces.append(' '.join(str(total) for total in reversed(totals)))
        cursor = group_stop

    pieces.append(text[cursor:])

    return ''.join(pieces)