Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
d8ca02d
modify usfm for chapter-level drafting to avoid import issues; move r…
mshannon-sil Mar 26, 2026
aef5d5d
move filtering before token processing
mshannon-sil Apr 15, 2026
e423708
add test case for chapter filtering
mshannon-sil Apr 15, 2026
1e2e999
make sure all text in \id is included
mshannon-sil Apr 16, 2026
707119c
update remark test and ensure remarks are added at the end of existin…
mshannon-sil Apr 16, 2026
e1865ea
add test case for including chapter 1 and header information
mshannon-sil Apr 16, 2026
4a8fb50
support both book-level and chapter-level remarks
mshannon-sil Apr 27, 2026
74dcf8f
add test case for multiple remarks for the same chapter
mshannon-sil Apr 27, 2026
ef9041f
fix init.py
mshannon-sil Apr 27, 2026
d9ecd2a
cover edge case of chapter as last marker
mshannon-sil Apr 27, 2026
e39f94c
handle malformed chapter numbers
mshannon-sil Apr 27, 2026
9cfd578
use parse_usfm; fix pass_remark test
mshannon-sil Apr 27, 2026
26a111d
Merge branch 'main' into incremental_draft
mshannon-sil Apr 28, 2026
911df51
don't filter if chapters is empty list
mshannon-sil Apr 28, 2026
cf0ea05
revert change
mshannon-sil Apr 28, 2026
e7f3c2c
Add MemoryParatextProjectTextUpdater and move MemoryParatextProjectFi…
mshannon-sil May 8, 2026
e284381
Move memory Paratext classes to testutils and remove filter_tokens_by…
mshannon-sil May 8, 2026
a11fa51
Fix import sorting in test files
mshannon-sil May 8, 2026
22af388
Mirror Machine's create_stylesheet in MemoryParatextProjectFileHandler
mshannon-sil May 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 37 additions & 3 deletions machine/corpora/paratext_project_text_updater_base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from abc import ABC
from typing import Callable, Iterable, Optional, Sequence, Union
from typing import Callable, Iterable, List, Optional, Sequence, Tuple, Union

from ..utils.string_utils import parse_integer
from .paratext_project_file_handler import ParatextProjectFileHandler
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
Expand All @@ -11,6 +12,8 @@
UpdateUsfmTextBehavior,
)
from .usfm_parser import parse_usfm
from .usfm_token import UsfmTokenType
from .usfm_tokenizer import UsfmToken, UsfmTokenizer
from .usfm_update_block_handler import UsfmUpdateBlockHandler, UsfmUpdateBlockHandlerError


Expand All @@ -30,14 +33,15 @@ def update_usfm(
self,
book_id: str,
rows: Optional[Sequence[UpdateUsfmRow]] = None,
chapters: Optional[Sequence[int]] = None,
full_name: Optional[str] = None,
text_behavior: UpdateUsfmTextBehavior = UpdateUsfmTextBehavior.PREFER_EXISTING,
paragraph_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
embed_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.PRESERVE,
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
remarks: Optional[Iterable[str]] = None,
remarks: Optional[Iterable[Tuple[int, str]]] = None,
error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None,
compare_segments: bool = False,
) -> Optional[str]:
Expand All @@ -60,7 +64,10 @@ def update_usfm(
compare_segments=compare_segments,
)
try:
parse_usfm(usfm, handler, self._settings.stylesheet, self._settings.versification)
tokenizer = UsfmTokenizer(self._settings.stylesheet)
tokens = tokenizer.tokenize(usfm)
tokens = filter_tokens_by_chapter(tokens, chapters)
parse_usfm(tokens, handler, self._settings.stylesheet, self._settings.versification)
return handler.get_usfm(self._settings.stylesheet)
except Exception as e:
error_message = (
Expand All @@ -69,3 +76,30 @@ def update_usfm(
f". Error: '{e}'"
)
raise RuntimeError(error_message) from e


def filter_tokens_by_chapter(
    tokens: Sequence[UsfmToken], chapters: Optional[Sequence[int]] = None
) -> Sequence[UsfmToken]:
    """Return only the tokens that belong to the requested chapters.

    The leading ``id`` line (the ``id`` marker at index 0 plus every following
    token up to the next token that carries a different marker) is always kept.
    Tokens between the end of the ``id`` line and the first chapter marker are
    kept only when chapter 1 is requested.

    Args:
        tokens: The tokenized USFM of one book, in document order.
        chapters: Chapter numbers to keep. ``None`` disables filtering and the
            input sequence is returned unchanged. An empty sequence keeps only
            the ``id`` line.

    Returns:
        ``tokens`` itself when ``chapters`` is ``None``; otherwise a new list
        containing the retained tokens in their original order.
    """
    if chapters is None:
        return tokens

    wanted_chapters = frozenset(chapters)
    kept_tokens: List[UsfmToken] = []
    in_id_line = False
    in_wanted_chapter = False

    for index, token in enumerate(tokens):
        if index == 0 and token.marker == "id":
            in_id_line = True
            # Material ahead of the first chapter marker travels with chapter 1.
            in_wanted_chapter = 1 in wanted_chapters
        elif in_id_line and token.marker is not None and token.marker != "id":
            # Any other marker terminates the id line.
            in_id_line = False

        # Deliberately a separate `if`, not part of the chain above: a chapter
        # marker that directly follows the id line both ends the id line and
        # has to be checked against the requested chapters.
        if token.type == UsfmTokenType.CHAPTER:
            chapter_num = parse_integer(token.data) if token.data else None
            # A chapter marker whose number is missing or does not yield an
            # integer closes the current chapter without opening a new one.
            in_wanted_chapter = chapter_num is not None and chapter_num in wanted_chapters

        if in_id_line or in_wanted_chapter:
            kept_tokens.append(token)

    return kept_tokens
47 changes: 35 additions & 12 deletions machine/corpora/update_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __init__(
style_behavior: UpdateUsfmMarkerBehavior = UpdateUsfmMarkerBehavior.STRIP,
preserve_paragraph_styles: Optional[Union[Iterable[str], str]] = None,
update_block_handlers: Optional[Iterable[UsfmUpdateBlockHandler]] = None,
remarks: Optional[Iterable[str]] = None,
remarks: Optional[Iterable[Tuple[int, str]]] = None,
error_handler: Optional[Callable[[UsfmUpdateBlockHandlerError], bool]] = None,
compare_segments: bool = False,
) -> None:
Expand Down Expand Up @@ -340,19 +340,42 @@ def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
tokenizer = UsfmTokenizer(stylesheet)
tokens = list(self._tokens)
if len(self._remarks) > 0:
remark_tokens: List[UsfmToken] = []
for remark in self._remarks:
remark_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
remark_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
remark_tokens_by_chapter: Dict[int, List[UsfmToken]] = {}
for chapter_num, remark in self._remarks:
chapter_tokens = remark_tokens_by_chapter.setdefault(chapter_num, [])
chapter_tokens.append(UsfmToken(UsfmTokenType.PARAGRAPH, "rem"))
chapter_tokens.append(UsfmToken(UsfmTokenType.TEXT, text=remark))
if len(tokens) > 0:
index = 0
markers_to_skip = {"id", "ide", "rem"}
while tokens[index].marker in markers_to_skip:
index += 1
if len(tokens) > index and tokens[index].type == UsfmTokenType.TEXT:
for chapter_num, remark_tokens in remark_tokens_by_chapter.items():
if chapter_num == 0:
index = 0
markers_to_skip = {"id", "ide", "rem"}
else:
index = next(
(
i
for i, token in enumerate(tokens)
if token.type == UsfmTokenType.CHAPTER
and token.data is not None
and str(token.data).isdigit()
and int(token.data) == chapter_num
),
-1,
)
if index == -1:
continue
index += 1
for remark_token in reversed(remark_tokens):
tokens.insert(index, remark_token)
markers_to_skip = {"rem"}

if index >= len(tokens):
tokens.extend(remark_tokens)
else:
while index < len(tokens) and tokens[index].marker in markers_to_skip:
index += 1
if index < len(tokens) and tokens[index].type == UsfmTokenType.TEXT:
index += 1

tokens[index:index] = remark_tokens
return tokenizer.detokenize(tokens)

def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str], Optional[dict[str, object]]]:
Expand Down
2 changes: 1 addition & 1 deletion machine/corpora/usfm_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@


def parse_usfm(
usfm: str,
usfm: Union[str, Sequence[UsfmToken]],
handler: UsfmParserHandler,
stylesheet: Union[StrPath, UsfmStylesheet] = "usfm.sty",
versification: Optional[Versification] = None,
Expand Down
Loading
Loading