From 1e6945407df3ff820346b9bacf41d9bd18e3c7b0 Mon Sep 17 00:00:00 2001 From: Trim21 Date: Fri, 22 Aug 2025 23:53:12 +0800 Subject: [PATCH 1/2] fix: should not search duplicated entries in different type --- beangulp/extract.py | 17 ++++++++++------- beangulp/extract_test.py | 3 +++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/beangulp/extract.py b/beangulp/extract.py index 58224bc..66e302e 100644 --- a/beangulp/extract.py +++ b/beangulp/extract.py @@ -1,6 +1,7 @@ __copyright__ = "Copyright (C) 2016-2017 Martin Blais" __license__ = "GNU GPLv2" +from collections import defaultdict import io import bisect import datetime @@ -8,7 +9,7 @@ import textwrap import warnings -from typing import Callable +from typing import Callable, Dict, Iterator from typing import TYPE_CHECKING from typing import Tuple from typing import List @@ -199,16 +200,18 @@ def mark_duplicate_entries( # of each newly extracted entry requires the existing entries # to be sorted by date. existing.sort(key=operator.attrgetter("date")) - dates = [entry.date for entry in existing] + dates: Dict[type, List[data.Directive]] = defaultdict(list) + for entry in existing: + dates[type(entry)].append(entry) - def entries_date_window_iterator(date): - lo = bisect.bisect_left(dates, date - window) - hi = bisect.bisect_right(dates, date + window) + def entries_date_window_iterator(entry_type: type, date: datetime.date) -> Iterator[data.Directive]: + lo = bisect.bisect_left(dates[entry_type], date - window, key=operator.attrgetter('date')) + hi = bisect.bisect_right(dates[entry_type], date + window, key=operator.attrgetter('date')) for i in range(lo, hi): - yield existing[i] + yield dates[entry_type][i] for entry in entries: - for target in entries_date_window_iterator(entry.date): + for target in entries_date_window_iterator(type(entry), entry.date): if compare(entry, target): entry.meta[DUPLICATE] = target diff --git a/beangulp/extract_test.py b/beangulp/extract_test.py index ceaa443..39f5056 100644 --- a/beangulp/extract_test.py +++ b/beangulp/extract_test.py @@ -63,6 +63,7 @@ def test_mark_duplicate_entries(self): 1970-01-02 * "Test" Assets:Tests 20.00 USD + 1970-01-03 balance Assets:Tests 20.00 USD """) ) compare = similar.heuristic_comparator() @@ -70,6 +71,8 @@ def test_mark_duplicate_entries(self): self.assertTrue(entries[0].meta[extract.DUPLICATE]) self.assertNotIn(extract.DUPLICATE, entries[1].meta) + self.assertFalse(extract.DUPLICATE in entries[2].meta) + class TestPrint(unittest.TestCase): def test_print_extracted_entries(self): From 7fbcfb097878798dd64c403c33dca8ff5be72422 Mon Sep 17 00:00:00 2001 From: Trim21 Date: Sat, 23 Aug 2025 00:18:44 +0800 Subject: [PATCH 2/2] check same type --- beangulp/extract.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/beangulp/extract.py b/beangulp/extract.py index 66e302e..1d5c211 100644 --- a/beangulp/extract.py +++ b/beangulp/extract.py @@ -212,8 +212,9 @@ def entries_date_window_iterator(entry_type: type, date: datetime.date) -> Itera for entry in entries: for target in entries_date_window_iterator(type(entry), entry.date): - if compare(entry, target): - entry.meta[DUPLICATE] = target + if type(entry) == type(target): + if compare(entry, target): + entry.meta[DUPLICATE] = target def print_extracted_entries(extracted: List[ExtractedEntry], output: io.TextIOBase) -> None: