From 81177ca83d196d9c87a45e93c719cbfc3e22709b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:25:46 +0000 Subject: [PATCH 1/5] Initial plan From 83aa6590df76a2624fb065e454eea440585b39ab Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:30:57 +0000 Subject: [PATCH 2/5] Port 'Handle invalid book id tags more robustly' from machine to machine.py Agent-Logs-Url: https://github.com/sillsdev/machine.py/sessions/9308171f-88dc-496e-b821-a521af004991 Co-authored-by: ddaspit <3261883+ddaspit@users.noreply.github.com> --- machine/corpora/usfm_file_text_corpus.py | 7 ++- machine/corpora/usfm_parser.py | 2 + machine/corpora/usfm_text_base.py | 3 + tests/corpora/test_usfm_memory_text.py | 70 ++++++++++++++++++++++ tests/testutils/data/usfm/Tes/03LEVTes.SFM | 2 +- tests/testutils/data/usfm/Tes/131CHTes.SFM | 2 +- 6 files changed, 82 insertions(+), 4 deletions(-) diff --git a/machine/corpora/usfm_file_text_corpus.py b/machine/corpora/usfm_file_text_corpus.py index dbc5f675..5e28b037 100644 --- a/machine/corpora/usfm_file_text_corpus.py +++ b/machine/corpora/usfm_file_text_corpus.py @@ -42,7 +42,10 @@ def _get_id(filename: StrPath, encoding: str) -> Optional[str]: if line.startswith("\\id "): id = line[4:] index = id.find(" ") + # If the id is longer than 3 characters, truncate it to 3 characters. + if (index == -1 or index > 3) and len(id) >= 3: + index = 3 if index != -1: - id = id[:index] - return id.strip().upper() + id = id[:index].upper() + return id.strip() return None diff --git a/machine/corpora/usfm_parser.py b/machine/corpora/usfm_parser.py index a37d5396..220ade28 100644 --- a/machine/corpora/usfm_parser.py +++ b/machine/corpora/usfm_parser.py @@ -176,6 +176,8 @@ def process_token(self) -> bool: # Code is always upper case assert token.data is not None code = token.data.upper() + if len(code) > 3: + code = code[:3] # Update verse ref. Leave book alone if not empty to prevent parsing errors on books with bad id lines. verse_ref = self.state.verse_ref diff --git a/machine/corpora/usfm_text_base.py b/machine/corpora/usfm_text_base.py index ee400909..d50372c6 100644 --- a/machine/corpora/usfm_text_base.py +++ b/machine/corpora/usfm_text_base.py @@ -91,6 +91,9 @@ def rows(self) -> Iterable[TextRow]: def start_book(self, state: UsfmParserState, marker: str, code: str) -> None: super().start_book(state, marker, code) + if state.verse_ref.book != "" and state.verse_ref.book != code: + # Ignore \id markers that don't match the book code in the verse ref, if it was set + return if code not in ALL_BOOK_IDS: raise ValueError(f"The book {code} is not a valid book id.") if code != self._text.id: diff --git a/tests/corpora/test_usfm_memory_text.py b/tests/corpora/test_usfm_memory_text.py index aa89412d..5d990949 100644 --- a/tests/corpora/test_usfm_memory_text.py +++ b/tests/corpora/test_usfm_memory_text.py @@ -465,6 +465,76 @@ def test_get_rows_incomplete_verse_range(): assert rows[3].text == "verse 1 text" +def test_get_rows_book_code_different_to_filename() -> None: + import pytest + + with pytest.raises(Exception): + get_rows( + r"""\id LUK - Test +\c 1 +\v 1 Verse 1 Text +""", + include_all_text=True, + ) + + +def test_get_rows_book_code_invalid() -> None: + import pytest + + with pytest.raises(Exception): + get_rows( + r"""\id ZZZ - Test +\c 1 +\v 1 Verse 1 Text +""", + include_all_text=True, + ) + + +def test_get_rows_book_code_truncated() -> None: + import pytest + + with pytest.raises(Exception): + get_rows( + r"""\id MA +\c 1 +\v 1 Verse 1 Text +""", + include_all_text=True, + ) + + +def test_get_rows_book_code_multiple() -> None: + rows: List[TextRow] = get_rows( + r"""\id MAT +\id LUK +\c 1 +\v 1 Verse 1 Text +""", + include_all_text=True, + ) + + assert len(rows) == 1 + + assert rows[0].ref == ScriptureRef.parse("MAT 1:1"), str.join(",", [str(tr.ref) for tr in rows]) + assert rows[0].text == "Verse 1 Text", str.join(",", [tr.text for tr in rows]) + + +def test_get_rows_book_code_no_space() -> None: + rows: List[TextRow] = get_rows( + r"""\id Matthew +\c 1 +\v 1 Verse 1 Text +""", + include_all_text=True, + ) + + assert len(rows) == 1 + + assert rows[0].ref == ScriptureRef.parse("MAT 1:1"), str.join(",", [str(tr.ref) for tr in rows]) + assert rows[0].text == "Verse 1 Text", str.join(",", [tr.text for tr in rows]) + + def get_rows(usfm: str, include_markers: bool = False, include_all_text: bool = False) -> List[TextRow]: text = UsfmMemoryText( UsfmStylesheet("usfm.sty"), diff --git a/tests/testutils/data/usfm/Tes/03LEVTes.SFM b/tests/testutils/data/usfm/Tes/03LEVTes.SFM index 6fc8cd9f..2ced0843 100644 --- a/tests/testutils/data/usfm/Tes/03LEVTes.SFM +++ b/tests/testutils/data/usfm/Tes/03LEVTes.SFM @@ -1,4 +1,4 @@ -\id lev - Test +\id Leviticus \h Leviticus \mt Leviticus \c 14 diff --git a/tests/testutils/data/usfm/Tes/131CHTes.SFM b/tests/testutils/data/usfm/Tes/131CHTes.SFM index 328b513a..f080c42b 100644 --- a/tests/testutils/data/usfm/Tes/131CHTes.SFM +++ b/tests/testutils/data/usfm/Tes/131CHTes.SFM @@ -1,4 +1,4 @@ -\id 1CH - Test +\id 1CH \h 1 Chronicles \mt 1 Chronicles \c 12 From d2db8aa9685d98fcf97bd6277ef8714a63c2cdcb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 19:32:38 +0000 Subject: [PATCH 3/5] Fix: always uppercase the returned id in _get_id Agent-Logs-Url: https://github.com/sillsdev/machine.py/sessions/9308171f-88dc-496e-b821-a521af004991 Co-authored-by: ddaspit <3261883+ddaspit@users.noreply.github.com> --- machine/corpora/usfm_file_text_corpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine/corpora/usfm_file_text_corpus.py b/machine/corpora/usfm_file_text_corpus.py index 5e28b037..6ef8b8b2 100644 --- a/machine/corpora/usfm_file_text_corpus.py +++ b/machine/corpora/usfm_file_text_corpus.py @@ -47,5 +47,5 @@ def _get_id(filename: StrPath, encoding: str) -> Optional[str]: index = 3 if index != -1: id = id[:index].upper() - return id.strip() + return id.strip().upper() return None From fd84e591eadf370741dbeda5c8121f20d5a01a22 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 20:03:04 +0000 Subject: [PATCH 4/5] Move pytest import to top of test_usfm_memory_text.py Agent-Logs-Url: https://github.com/sillsdev/machine.py/sessions/65c86bdf-e2f9-4998-b0ba-5b62409675cd Co-authored-by: ddaspit <3261883+ddaspit@users.noreply.github.com> --- tests/corpora/test_usfm_memory_text.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/corpora/test_usfm_memory_text.py b/tests/corpora/test_usfm_memory_text.py index 5d990949..d25c9a29 100644 --- a/tests/corpora/test_usfm_memory_text.py +++ b/tests/corpora/test_usfm_memory_text.py @@ -1,5 +1,6 @@ from typing import List +import pytest from testutils.corpora_test_helpers import scripture_ref from machine.corpora import ScriptureRef, TextRow, UsfmMemoryText @@ -466,8 +467,6 @@ def test_get_rows_incomplete_verse_range(): def test_get_rows_book_code_different_to_filename() -> None: - import pytest - with pytest.raises(Exception): get_rows( r"""\id LUK - Test @@ -479,8 +478,6 @@ def test_get_rows_book_code_different_to_filename() -> None: def test_get_rows_book_code_invalid() -> None: - import pytest - with pytest.raises(Exception): get_rows( r"""\id ZZZ - Test @@ -492,8 +489,6 @@ def test_get_rows_book_code_invalid() -> None: def test_get_rows_book_code_truncated() -> None: - import pytest - with pytest.raises(Exception): get_rows( r"""\id MA From ad16cf11d354a72ece65ebaa51183b8ca126eda0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 23 Apr 2026 20:24:49 +0000 Subject: [PATCH 5/5] Use RuntimeError instead of Exception in book code error tests Agent-Logs-Url: https://github.com/sillsdev/machine.py/sessions/7f649636-4646-4f0b-ab79-e05fa6d8acda Co-authored-by: ddaspit <3261883+ddaspit@users.noreply.github.com> --- tests/corpora/test_usfm_memory_text.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/corpora/test_usfm_memory_text.py b/tests/corpora/test_usfm_memory_text.py index d25c9a29..b4f5f67e 100644 --- a/tests/corpora/test_usfm_memory_text.py +++ b/tests/corpora/test_usfm_memory_text.py @@ -467,7 +467,7 @@ def test_get_rows_incomplete_verse_range(): def test_get_rows_book_code_different_to_filename() -> None: - with pytest.raises(Exception): + with pytest.raises(RuntimeError): get_rows( r"""\id LUK - Test \c 1 @@ -478,7 +478,7 @@ def test_get_rows_book_code_different_to_filename() -> None: def test_get_rows_book_code_invalid() -> None: - with pytest.raises(Exception): + with pytest.raises(RuntimeError): get_rows( r"""\id ZZZ - Test \c 1 @@ -489,7 +489,7 @@ def test_get_rows_book_code_invalid() -> None: def test_get_rows_book_code_truncated() -> None: - with pytest.raises(Exception): + with pytest.raises(RuntimeError): get_rows( r"""\id MA \c 1