From 5d1c282359afa3b34d4c6792b5b2bfe11709d853 Mon Sep 17 00:00:00 2001 From: Rossi-Luciano Date: Sun, 31 May 2026 18:52:16 -0300 Subject: [PATCH 01/11] =?UTF-8?q?Implementa=20pipeline=20de=20identifica?= =?UTF-8?q?=C3=A7=C3=A3o=20e=20inser=C3=A7=C3=A3o=20autom=C3=A1tica=20de?= =?UTF-8?q?=20xref=20(DOCX=20=E2=86=92=20SPS=20XML)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adiciona módulo xref.py com detecção de estilo de citação (ABNT narrativo, ABNT parentético, Vancouver bracket, Vancouver superscript), marcação de bookmarks e hyperlinks no DOCX, validação de consistência e extração do mapeamento citação→rid. Integra o pipeline em tasks.py: o DOCX é marcado antes do processamento principal, o xref_map é aplicado em stream_data_body e repassado a get_xml(). Ranges Vancouver são expandidos para rid multi-valor (ex: "[26-27]" → "B26 B27"). Em xml.py, substitui o guard coarse `if 'xref' not in paragraph` por processamento por segmento via _apply_to_segments(), eliminando risco de double-wrapping e garantindo que citações em parágrafos parcialmente marcados não sejam ignoradas. O LLM (proccess_labeled_text) passa a atuar como fallback. Adiciona campos marked_file e xref_status ao modelo ArticleDocxMarkup, com widgets visuais, proxy model ProcessedDocx e views download_marked_docx e reprocess para suporte à revisão humana via interface Wagtail. Co-Authored-By: Claude Sonnet 4.6 --- .../0003_articledocxmarkup_marked_file.py | 18 + .../0004_articledocxmarkup_xref_status.py | 16 + markup_doc/models.py | 119 ++- markup_doc/tasks.py | 100 ++- markup_doc/views.py | 37 +- markup_doc/wagtail_hooks.py | 20 + markup_doc/xml.py | 122 ++- markup_doc/xref.py | 840 ++++++++++++++++++ 8 files changed, 1224 insertions(+), 48 deletions(-) create mode 100644 markup_doc/migrations/0003_articledocxmarkup_marked_file.py create mode 100644 markup_doc/migrations/0004_articledocxmarkup_xref_status.py create mode 100644 markup_doc/xref.py diff --git a/markup_doc/migrations/0003_articledocxmarkup_marked_file.py b/markup_doc/migrations/0003_articledocxmarkup_marked_file.py new file mode 100644 index 0000000..509a53e --- /dev/null +++ b/markup_doc/migrations/0003_articledocxmarkup_marked_file.py @@ -0,0 +1,18 @@ +# Generated by Django 6.0.5 on 2026-05-26 14:16 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('markup_doc', '0002_alter_articledocx_estatus_and_more'), + ] + + operations = [ + migrations.AddField( + model_name='articledocxmarkup', + name='marked_file', + field=models.FileField(blank=True, null=True, upload_to='uploads_docx_marked/', verbose_name='Marked Document'), + ), + ] diff --git a/markup_doc/migrations/0004_articledocxmarkup_xref_status.py b/markup_doc/migrations/0004_articledocxmarkup_xref_status.py new file mode 100644 index 0000000..c7c1448 --- /dev/null +++ b/markup_doc/migrations/0004_articledocxmarkup_xref_status.py @@ -0,0 +1,16 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('markup_doc', '0003_articledocxmarkup_marked_file'), + ] + + operations = [ + migrations.AddField( + model_name='articledocxmarkup', + name='xref_status', + field=models.JSONField(blank=True, null=True, verbose_name='XRef Status'), + ), + ] \ No newline at end of file diff --git a/markup_doc/models.py b/markup_doc/models.py index b3ef32f..ae16222 100644 --- a/markup_doc/models.py +++ b/markup_doc/models.py @@ -1,10 +1,13 @@ +import json +import os + from django import forms from django.db import models from django.urls import reverse -from django.utils.html import format_html +from django.utils.html import format_html, mark_safe from django.utils.translation import gettext_lazy as _ from modelcluster.models import ClusterableModel -from wagtail.admin.panels import FieldPanel, ObjectList, TabbedInterface +from wagtail.admin.panels import FieldPanel, ObjectList, Panel, TabbedInterface from wagtail.blocks import ChoiceBlock, StreamBlock, StructBlock, TextBlock from wagtail.fields import StreamField from wagtail.images.blocks import ImageChooserBlock @@ -25,8 +28,6 @@ class ProcessStatus(models.IntegerChoices): class ReadOnlyFileWidget(forms.Widget): def render(self, name, value, attrs=None, renderer=None): if value: - # Muestra el archivo como un enlace de descarga - # return format_html('{}', value.url, value.name.split('/')[-1]) instance = value.instance url = reverse("generate_xml", args=[instance.pk]) return format_html( @@ -35,6 +36,74 @@ def render(self, name, value, attrs=None, renderer=None): return "" +class DownloadMarkedFileWidget(forms.Widget): + def render(self, name, value, attrs=None, renderer=None): + if value: + instance = value.instance + url = reverse("download_marked_docx", args=[instance.pk]) + filename = os.path.basename(value.name) + return format_html( + '{}', url, filename + ) + return mark_safe('Não disponível ainda') + + +class XrefStatusWidget(forms.Widget): + def render(self, name, value, attrs=None, renderer=None): + if isinstance(value, str): + try: + value = json.loads(value) + except (json.JSONDecodeError, ValueError): + value = None + if not value: + return mark_safe('Não processado') + valid = value.get("valid", False) + total_refs = value.get("total_references", 0) + total_cits = value.get("total_citations", 0) + orphaned_bk = len(value.get("orphaned_bookmarks", [])) + orphaned_hl = value.get("orphaned_hyperlinks", []) + color = "green" if valid else "red" + status = "Válido" if valid else "Inválido" + html = format_html( + '

{}   {} referências | {} citações linkadas

', + color, status, total_refs, total_cits, + ) + if orphaned_hl: + html += format_html( + '

Citações sem referência: {}

', + ', '.join(orphaned_hl), + ) + if orphaned_bk: + html += format_html( + '

{} referência(s) sem citação no texto

', + orphaned_bk, + ) + return html + + +class ReprocessButtonPanel(Panel): + def __init__(self, confirm_message="Reprocessar este documento?", **kwargs): + super().__init__(**kwargs) + self.confirm_message = confirm_message + + def clone_kwargs(self): + return {**super().clone_kwargs(), "confirm_message": self.confirm_message} + + class BoundPanel(Panel.BoundPanel): + def render_html(self, parent_context=None): + if not self.instance or not self.instance.pk: + return "" + url = reverse("reprocess", args=[self.instance.pk]) + msg = self.panel.confirm_message.replace("'", "\\'") + return mark_safe( + f'
' + f'' + f'Reprocessar
' + ) + + class ArticleDocx(CommonControlField): title = models.TextField(_("Document Title"), null=True, blank=True) file = models.FileField( @@ -278,6 +347,17 @@ class ArticleDocxMarkup(CommonControlField, ClusterableModel): verbose_name=_("Document"), upload_to="uploads_docx/", ) + marked_file = models.FileField( + null=True, + blank=True, + verbose_name=_("Marked Document"), + upload_to="uploads_docx_marked/", + ) + xref_status = models.JSONField( + _("XRef Status"), + null=True, + blank=True, + ) estatus = models.IntegerField( _("Process estatus"), choices=ProcessStatus.choices, @@ -380,6 +460,16 @@ def __str__(self): title = self.title or "" return f"{title} | {self.estatus}" + def get_marked_file_status(self): + if not self.marked_file: + return "Aguardando processamento" + if self.xref_status: + total = self.xref_status.get("total_references", 0) + cits = self.xref_status.get("total_citations", 0) + return f"✓ Disponível ({total} refs, {cits} citações)" + return "✓ Disponível" + get_marked_file_status.short_description = _("DOCX Marcado") + @property def url_download(self): return self.file_xml.url if self.file_xml else None @@ -442,6 +532,9 @@ class MarkupXML(ArticleDocxMarkup): panels_xml = [ FieldPanel("file_xml", widget=ReadOnlyFileWidget()), FieldPanel("text_xml"), + ReprocessButtonPanel( + confirm_message="Isso irá descartar as edições manuais e reprocessar o DOCX original. Continuar?" + ), ] panels_details = [ @@ -489,3 +582,21 @@ class MarkupXML(ArticleDocxMarkup): class Meta: proxy = True + + +class ProcessedDocx(ArticleDocxMarkup): + panels_doc = [ + FieldPanel("title"), + FieldPanel("marked_file", widget=DownloadMarkedFileWidget()), + FieldPanel("xref_status", widget=XrefStatusWidget()), + ReprocessButtonPanel(confirm_message="Reprocessar este documento?"), + ] + + edit_handler = TabbedInterface([ + ObjectList(panels_doc, heading=_("DOCX Marcado")), + ]) + + class Meta: + proxy = True + verbose_name = _("DOCX processado") + verbose_name_plural = _("DOCXs processados") diff --git a/markup_doc/tasks.py b/markup_doc/tasks.py index a2e5717..b4f3988 100644 --- a/markup_doc/tasks.py +++ b/markup_doc/tasks.py @@ -1,7 +1,9 @@ # Local application imports # Standard library imports +import io import json import logging +import os import re # Third-party imports @@ -26,6 +28,13 @@ from markup_doc.models import MarkupXML, ProcessStatus, UploadDocx from markup_doc.sync_api import sync_journals_from_api from markup_doc.xml import get_xml +from markup_doc.xref import ( + build_text_xref_replacer, + is_marked, + mark_references, + read_marks, + validate_marks, +) from markuplib.function_docx import functionsDocx from model_ai.llama import LlamaInputSettings, LlamaService from reference.config_gemini import create_prompt_reference @@ -87,6 +96,70 @@ def get_labels(article_id, user_id): llm_model, ) doc = functionsDocx.openDocx(article_docx.file.path) + + if not is_marked(doc): + doc = mark_references(doc) + + xref_validation = validate_marks(doc) + if not xref_validation["valid"]: + for err in xref_validation["errors"]: + print(f"[xref] ERROR: {err}") + + article_docx.xref_status = { + "valid": xref_validation["valid"], + "total_references": len(xref_validation["bookmarks"]), + "total_citations": len(xref_validation["hyperlinks"]), + "orphaned_bookmarks": xref_validation["orphaned_bookmarks"], + "orphaned_hyperlinks": xref_validation["orphaned_hyperlinks"], + "warnings": xref_validation["warnings"], + "errors": xref_validation["errors"], + } + + ref_marks = read_marks(doc) + xref_map = { + cit: ref["rid"] + for ref in ref_marks + for cit in ref["citations"] + if cit + } + # Expand Vancouver range/multi citations to include all rids. + # e.g. "[26-27]" linked to B26 should produce rid="B26 B27"; + # "[3,4,5]" linked to B3 should produce rid="B3 B4 B5". + _bracket_re = re.compile(r'^\[(\d+(?:[,\-]\d+)*)\]$') + for cit, rid in list(xref_map.items()): + m = _bracket_re.match(cit.strip()) + if not m: + continue + numbers = [] + for part in m.group(1).split(','): + part = part.strip() + if '-' in part: + a, b = part.split('-', 1) + try: + numbers.extend(range(int(a), int(b) + 1)) + except ValueError: + pass + else: + try: + numbers.append(int(part)) + except ValueError: + pass + if len(numbers) > 1: + xref_map[cit] = ' '.join(f'B{n}' for n in numbers) + italic_variants = { + cit.replace("et al.", "et al."): rid + for cit, rid in xref_map.items() + if "et al." in cit + } + xref_map.update(italic_variants) + text_xref_fn = build_text_xref_replacer(doc) + + buf = io.BytesIO() + doc.save(buf) + buf.seek(0) + marked_name = os.path.splitext(os.path.basename(article_docx.file.name))[0] + "_marked.docx" + article_docx.marked_file.save(marked_name, ContentFile(buf.read()), save=False) + sections, content = functionsDocx().extractContent(doc, article_docx.file.path) article_docx_markup = article_docx text_title = "" @@ -361,6 +434,31 @@ def get_labels(article_id, user_id): stream_data_back.extend(process_references(num_refs, output)) + # data_front is never iterated inside get_xml — rescue any

items that the + # state machine left in stream_data (body paragraphs misclassified as front + # because their section headings use named Word styles with font_size=0). + rescued = [item for item in stream_data if item.get('value', {}).get('label') == '

'] + if rescued: + stream_data_body = rescued + stream_data_body + stream_data = [item for item in stream_data if item not in rescued] + + # Apply xref_map (DOCX hyperlinks) and narrative Author (year) xrefs to body. + for item in stream_data_body: + if item.get('value', {}).get('label') == '

': + para = item['value'].get('paragraph', '') or '' + if not para: + continue + # 1. Dict-based from DOCX hyperlinks + if xref_map: + for cit_text, rid in sorted(xref_map.items(), key=lambda x: -len(x[0])): + para = para.replace( + cit_text, + f'{cit_text}', + ) + # 2. Narrative "Author (year)" citations + para = text_xref_fn(para) + item['value']['paragraph'] = para + article_docx_markup.content = stream_data article_docx_markup.content_body = stream_data_body article_docx_markup.content_back = stream_data_back @@ -368,7 +466,7 @@ def get_labels(article_id, user_id): article_docx_markup.save() xml, stream_data_body = get_xml( - article_docx, stream_data, stream_data_body, stream_data_back + article_docx, stream_data, stream_data_body, stream_data_back, xref_map=xref_map ) persist_article_xml(article_docx_markup, xml, stream_data_body) diff --git a/markup_doc/views.py b/markup_doc/views.py index 9c002da..ac2cfe4 100644 --- a/markup_doc/views.py +++ b/markup_doc/views.py @@ -1,5 +1,6 @@ -from django.shortcuts import render -from django.http import HttpResponse, HttpResponseBadRequest, Http404 +from django.contrib import messages +from django.shortcuts import render, redirect +from django.http import FileResponse, HttpResponse, HttpResponseBadRequest, Http404 from .models import ArticleDocxMarkup #from .xml import extraer_citas_apa from django.http import JsonResponse @@ -53,6 +54,38 @@ def generate_xml(request, id_registro): return HttpResponse(f"Error al generar el XML: {str(e)}", status=500) +def download_marked_docx(request, pk): + try: + registro = ArticleDocxMarkup.objects.get(pk=pk) + if not registro.marked_file: + raise Http404("Arquivo marcado não disponível") + filename = os.path.basename(registro.marked_file.name) + return FileResponse( + registro.marked_file.open("rb"), + as_attachment=True, + filename=filename, + ) + except ArticleDocxMarkup.DoesNotExist: + raise Http404("Registro não encontrado") + + +def reprocess(request, pk): + from markup_doc.tasks import get_labels + from markup_doc.models import ProcessStatus + try: + registro = ArticleDocxMarkup.objects.get(pk=pk) + if not registro.file or not registro.file.name: + messages.error(request, "Arquivo DOCX original não encontrado.") + return redirect(request.META.get("HTTP_REFERER", "/admin/")) + registro.estatus = ProcessStatus.PROCESSING + registro.save(update_fields=["estatus"]) + get_labels.delay(registro.pk, request.user.id) + messages.success(request, f'Reprocessamento iniciado para "{registro.title}".') + except ArticleDocxMarkup.DoesNotExist: + messages.error(request, "Registro não encontrado.") + return redirect(request.META.get("HTTP_REFERER", "/admin/")) + + def extract_citation(request): if request.method == "POST": diff --git a/markup_doc/wagtail_hooks.py b/markup_doc/wagtail_hooks.py index 4eb729b..bf2f9f7 100644 --- a/markup_doc/wagtail_hooks.py +++ b/markup_doc/wagtail_hooks.py @@ -22,6 +22,7 @@ CollectionModel, JournalModel, MarkupXML, + ProcessedDocx, ProcessStatus, UploadDocx, ) @@ -40,6 +41,12 @@ def register_admin_urls(): path( "download-xml//", views.generate_xml, name="generate_xml" ), + path( + "download-marked-docx//", + views.download_marked_docx, + name="download_marked_docx", + ), + path("reprocess//", views.reprocess, name="reprocess"), path("extract-citation/", views.extract_citation, name="extract_citation"), path("get_journal/", views.get_journal, name="get_journal"), path("download-zip/", views.generate_zip, name="generate_zip"), @@ -184,6 +191,18 @@ def index_view(self, request): return response +class ProcessedDocxViewSet(SnippetViewSet): + model = ProcessedDocx + menu_label = _("DOCX processado") + menu_icon = "doc-full-inverse" + add_to_admin_menu = False + exclude_from_explorer = False + list_per_page = 20 + list_display = ("title", "get_estatus_display", "get_marked_file_status") + search_fields = ("title",) + list_filter = ("estatus",) + + class XMLSPSSnippetViewSetGroup(SnippetViewSetGroup): menu_name = "xml_sps" menu_label = _("XML SPS") @@ -214,6 +233,7 @@ class MarkupSnippetViewSetGroup(SnippetViewSetGroup): menu_order = get_menu_order("markup_doc") items = ( UploadDocxViewSet, + ProcessedDocxViewSet, XMLSPSSnippetViewSetGroup, ) diff --git a/markup_doc/xml.py b/markup_doc/xml.py index 3698ca9..e94ccf8 100644 --- a/markup_doc/xml.py +++ b/markup_doc/xml.py @@ -16,6 +16,45 @@ proccess_special_content, sanitize_inline_xml_fragment, ) +from markup_doc.xref import make_text_xref_fn_from_refs + +_XREF_SPLIT_RE = re.compile(r'(]*>.*?)', re.DOTALL) + + +def _apply_to_segments(text, fn): + """Apply fn only to plain-text segments, leaving existing tags intact.""" + parts = _XREF_SPLIT_RE.split(text) + return ''.join(fn(part) if i % 2 == 0 else part for i, part in enumerate(parts)) + + +def _apply_xref_map(paragraph, xref_map): + """Apply xref_map replacements segment-by-segment to avoid double-wrapping.""" + def replace_in_segment(seg): + for cit_text, rid in sorted(xref_map.items(), key=lambda x: -len(x[0])): + seg = seg.replace( + cit_text, + f'{cit_text}', + ) + return seg + return _apply_to_segments(paragraph, replace_in_segment) + + +def _apply_proccess_labeled_text(paragraph, data_back): + """Apply proccess_labeled_text to each plain-text segment independently.""" + def process_segment(seg): + if not seg: + return seg + refs = proccess_labeled_text(seg, data_back) + for r in refs: + if r.get('refid') and not re.search( + rf']*>{re.escape(r["cita"])}', seg + ): + seg = seg.replace( + r['cita'], + f'{r["cita"]}', + ) + return seg + return _apply_to_segments(paragraph, process_segment) def extract_date(texto): @@ -42,7 +81,18 @@ def extract_date(texto): return None # No se encontró -def get_xml(article_docx, data_front, data, data_back): +def get_xml(article_docx, data_front, data, data_back, xref_map=None): + # Build narrative Author (year) xref replacer from data_back reference texts + _text_xref_refs = [ + { + 'rid': item['value'].get('refid') or f'B{i + 1}', + 'ref_text': item['value'].get('paragraph') or '', + } + for i, item in enumerate(data_back) + if item.get('value') + ] + _text_xref_fn = make_text_xref_fn_from_refs(_text_xref_refs) + # Crear el elemento raíz nsmap = { "mml": "http://www.w3.org/1998/Math/MathML", @@ -422,10 +472,12 @@ def get_xml(article_docx, data_front, data, data_back): node_tmp = etree.SubElement(node, "abstract") - if vals: + if vals and vals[0]: node_tmp2 = etree.SubElement(node_tmp, "title") append_fragment(node_tmp2, vals[0].value.get("paragraph")) + if vals2 and vals2[0]: + if vals2: # Encuentra su índice original en article_docx.content last_index = data_t.index(vals2[0]) @@ -622,6 +674,7 @@ def get_xml(article_docx, data_front, data, data_back): node_table_text = re.sub(r"\s*\n\s*", "", node_table_text).replace( "
", "" ) + node_table_text = re.sub(r"&(?!\w+;|#\d+;)", "&", node_table_text) tabla_element = parse_xml_fragment(node_table_text) @@ -716,31 +769,21 @@ def get_xml(article_docx, data_front, data, data_back): else: node_p = etree.SubElement(node, "p") - # refs = extraer_citas_apa(d['value']['paragraph'].replace('[style name="italic"]', '').replace('[/style]', ''), data_back) - # refs = extraer_citas_apa(d['value']['paragraph'].replace('', '').replace('', ''), data_back) - if "xref" not in d["value"]["paragraph"]: - refs = proccess_labeled_text(d["value"]["paragraph"], data_back) - for r in refs: - # print(f"r in refs: {r}") - d["value"]["paragraph"] = d["value"]["paragraph"].replace( - r["cita"], - f"{r['cita']}", - ) - """ - if 'et al' in r['cita']: - et_al_replace = r['cita'].replace('et al', 'et al') - d['value']['paragraph'] = d['value']['paragraph'].replace(et_al_replace, f"{et_al_replace}") - else: - #print(r['cita']) - d['value']['paragraph'] = d['value']['paragraph'].replace(r['cita'], f"{r['cita']}") - """ - - elements = proccess_special_content(d["value"]["paragraph"], data) - for e in elements: - d["value"]["paragraph"] = d["value"]["paragraph"].replace( - e["label"], - f"{e['label']}", - ) + # Apply all xref passes to every paragraph, operating segment-by-segment + # so that citations already marked by tasks.py pre-processing are not + # double-wrapped, and citations in the same paragraph that were missed + # still get processed. + if xref_map: + d["value"]["paragraph"] = _apply_xref_map(d["value"]["paragraph"], xref_map) + d["value"]["paragraph"] = _text_xref_fn(d["value"]["paragraph"]) + d["value"]["paragraph"] = _apply_proccess_labeled_text(d["value"]["paragraph"], data_back) + + elements = proccess_special_content(d["value"]["paragraph"], data) + for e in elements: + d["value"]["paragraph"] = d["value"]["paragraph"].replace( + e["label"], + f"{e['label']}", + ) append_fragment(node_p, d["value"]["paragraph"]) @@ -808,19 +851,16 @@ def get_xml(article_docx, data_front, data, data_back): append_fragment(node_tit, d["value"]["paragraph"]) if d["value"]["label"] == "

": values = d["value"] - node_ref = etree.SubElement( - node_reflist, "ref", attrib={"id": values["refid"]} - ) - # node_label = etree.SubElement(node_ref, 'label') - # append_fragment(node_label, values['refid'].replace('B', '')) + refid = values.get("refid") or f"B{i + 1}" + node_ref = etree.SubElement(node_reflist, "ref", attrib={"id": refid}) node_mix = etree.SubElement(node_ref, "mixed-citation") append_fragment(node_mix, values["paragraph"]) - if values["reftype"] == "journal": + if values.get("reftype") == "journal": node_elem = etree.SubElement( node_ref, "element-citation", - attrib={"publication-type": values["reftype"]}, + attrib={"publication-type": values.get("reftype")}, ) node_person = etree.SubElement( node_elem, "person-group", attrib={"person-group-type": "author"} @@ -874,7 +914,7 @@ def get_xml(article_docx, data_front, data, data_back): values["uri"], ) - if values["reftype"] == "book": + if values.get("reftype") == "book": node_elem = etree.SubElement( node_ref, "element-citation", @@ -911,11 +951,11 @@ def get_xml(article_docx, data_front, data, data_back): etree.SubElement(node_ref, "lpage"), str(values["lpage"]) ) - if values["reftype"] == "data": + if values.get("reftype") == "data": node_elem = etree.SubElement( node_ref, "element-citation", - attrib={"publication-type": values["reftype"]}, + attrib={"publication-type": values.get("reftype")}, ) node_person = etree.SubElement( node_elem, "person-group", attrib={"person-group-type": "author"} @@ -952,11 +992,11 @@ def get_xml(article_docx, data_front, data, data_back): values["uri"], ) - if values["reftype"] == "webpage": + if values.get("reftype") == "webpage": node_elem = etree.SubElement( node_ref, "element-citation", - attrib={"publication-type": values["reftype"]}, + attrib={"publication-type": values.get("reftype")}, ) node_person = etree.SubElement( node_elem, "person-group", attrib={"person-group-type": "author"} @@ -985,11 +1025,11 @@ def get_xml(article_docx, data_front, data, data_back): etree.SubElement(node_ref, "access-date"), values["access_date"] ) - if values["reftype"] == "confproc": + if values.get("reftype") == "confproc": node_elem = etree.SubElement( node_ref, "element-citation", - attrib={"publication-type": values["reftype"]}, + attrib={"publication-type": values.get("reftype")}, ) node_person = etree.SubElement( node_elem, "person-group", attrib={"person-group-type": "author"} diff --git a/markup_doc/xref.py b/markup_doc/xref.py new file mode 100644 index 0000000..ae0c5eb --- /dev/null +++ b/markup_doc/xref.py @@ -0,0 +1,840 @@ +""" +Cross-reference (xref) linking for the DOCX → SPS XML pipeline. + +Official convention +------------------- +- Each reference entry in the reference list receives a bookmark named + ``xref_B{n}`` (1-indexed, n = position in the reference list). +- Each in-text citation becomes a Word internal hyperlink whose anchor + points to the corresponding ``xref_B{n}`` bookmark. + +This convention allows: +- Clicking a citation in Word → jumps to the reference entry. +- Clicking the reference entry bookmark → jumps back (if a reverse + hyperlink is added by the editor). + +Supported citation styles (auto-detected for unmarked documents): +- ABNT : (Autor, 2020) or (Autor et al., 2020) +- Vancouver bracket : [1] or [7,8] or [3-5] +- Vancouver superscript: runs with font.superscript == True containing digits + +Validation rules: +- ERROR : a hyperlink points to a bookmark that does not exist. +- WARNING : a bookmark has no corresponding hyperlink (uncited reference). +""" + +import copy +import re +import unicodedata + +from docx import Document +from docx.oxml import OxmlElement +from docx.oxml.ns import qn + +BOOKMARK_PREFIX = "xref_B" + +_REF_HEADINGS = { + "references", + "referências", + "referências bibliográficas", + "referencias", + "referencias bibliográficas", + "bibliography", + "bibliografia", +} + +_STOP_HEADINGS = { + "figures captions", + "figure captions", + "figures", + "supplementary material", + "supplementary materials", + "appendix", + "appendices", + "supporting information", + "acknowledgements", + "acknowledgments", + "agradecimentos", + "material suplementar", + "notas", + "notes", + # author/editor metadata sections + "author contributions", + "contribuições dos autores", + "contribuciones de los autores", + "data availability", + "data availability statement", + "disponibilidade dos dados", + "funding", + "financiamento", + "conflict of interest", + "conflicts of interest", + "conflito de interesses", + "declaration of competing interest", + "editors", + "editor associado", + "editor científico", + "associate editor", + "scientific editor", +} + +_ALLCAPS_STOP_RE = re.compile(r'^[A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒ\s\-]{4,60}$') + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +def is_marked(doc: Document) -> bool: + """Return True if *doc* contains xref_B* bookmarks AND hyperlinks.""" + xml = doc.element.xml + has_bk = bool(re.search(rf'w:name="{BOOKMARK_PREFIX}\d+"', xml)) + has_hl = bool(re.search(rf'w:anchor="{BOOKMARK_PREFIX}\d+"', xml)) + return has_bk and has_hl + + +def validate_marks(doc: Document) -> dict: + """ + Validate consistency of xref markup. + + Returns a dict:: + + { + "valid": bool, # False when any hyperlink is orphaned + "bookmarks": set[str], # all xref_B* bookmarks found + "hyperlinks": set[str], # all xref_B* anchors found + "orphaned_bookmarks": list, # bookmarks without a citation (warnings) + "orphaned_hyperlinks": list, # citations without a reference (errors) + "warnings": list[str], + "errors": list[str], + } + """ + xml = doc.element.xml + bookmarks = set(re.findall(rf'w:name="({BOOKMARK_PREFIX}\d+)"', xml)) + hyperlinks = set(re.findall(rf'w:anchor="({BOOKMARK_PREFIX}\d+)"', xml)) + + orphaned_bk = sorted(bookmarks - hyperlinks) + orphaned_hl = sorted(hyperlinks - bookmarks) + + warnings = [f"Reference {b} has no in-text citation." for b in orphaned_bk] + errors = [f"Citation links to {h} but no matching reference bookmark found." for h in orphaned_hl] + + return { + "valid": len(orphaned_hl) == 0, + "bookmarks": bookmarks, + "hyperlinks": hyperlinks, + "orphaned_bookmarks": orphaned_bk, + "orphaned_hyperlinks": orphaned_hl, + "warnings": warnings, + "errors": errors, + } + + +def read_marks(doc: Document) -> list: + """ + Extract xref data from a marked document. + + Returns a list of dicts (one per reference), ordered by bookmark index:: + + [ + { + "rid": "B1", + "bookmark": "xref_B1", + "ref_text": "AUTOR, A. 2020. Título...", + "citations": ["(Autor, 2020)", ...], # in-text citation texts + }, + ... + ] + """ + xml = doc.element.xml + + # Collect all bookmark names present + bk_names = sorted( + set(re.findall(rf'w:name="({BOOKMARK_PREFIX}\d+)"', xml)), + key=lambda s: int(s[len(BOOKMARK_PREFIX):]), + ) + + # Map anchor → list of citation texts extracted from hyperlinks + citation_map: dict[str, list[str]] = {b: [] for b in bk_names} + + # Scan ALL paragraphs (including those inside table cells) + for p_elem in doc.element.body.iter(qn("w:p")): + p_xml = p_elem.xml + for m in re.finditer( + rf']+w:anchor="({BOOKMARK_PREFIX}\d+)"[^>]*>(.*?)', + p_xml, + re.DOTALL, + ): + anchor = m.group(1) + inner = m.group(2) + # Extract plain text from the hyperlink's runs and unescape XML entities + texts = re.findall(r']*>([^<]*)', inner) + citation_text = "".join(texts).strip().replace("&", "&").replace("<", "<").replace(">", ">").replace(""", '"').replace("'", "'") + if anchor in citation_map: + citation_map[anchor].append(citation_text) + + # Map bookmark → reference paragraph text + ref_paragraphs = _find_references_section(doc) + ref_text_map: dict[str, str] = {} + for idx, (_, para) in enumerate(ref_paragraphs, start=1): + bk = f"{BOOKMARK_PREFIX}{idx}" + ref_text_map[bk] = para.text.strip() + + result = [] + for bk in bk_names: + n = bk[len(BOOKMARK_PREFIX):] + result.append({ + "rid": f"B{n}", + "bookmark": bk, + "ref_text": ref_text_map.get(bk, ""), + "citations": citation_map.get(bk, []), + }) + return result + + +def mark_references(doc: Document) -> Document: + """ + Auto-detect citations and add xref markup to *doc*. + + 1. Adds ``xref_B{n}`` bookmarks to each reference entry. + 2. Detects the citation style (ABNT, Vancouver bracket, superscript). + 3. Wraps in-text citations in internal hyperlinks pointing to the + corresponding bookmark. + + Returns the modified Document (same object, mutated in place). + """ + refs = _find_references_section(doc) + if not refs: + return doc + + # Step 1 — bookmark each reference + bk_id_start = _next_bookmark_id(doc) + for offset, (_, para) in enumerate(refs): + bk_name = f"{BOOKMARK_PREFIX}{offset + 1}" + _add_bookmark_to_para(para, bk_name, bk_id_start + offset) + + # Build reference index for matching + ref_index = _build_ref_index(refs) + + # Step 2 — detect style and find citations + style = _detect_citation_style(doc) + + if style == "vancouver_bracket": + citations = _find_citations_bracket(doc) + elif style == "vancouver_superscript": + citations = _find_citations_superscript(doc) + else: + citations = _find_citations_abnt(doc, ref_index) + + # Step 3 — insert hyperlinks + for para, spans in citations.items(): + _insert_hyperlinks(para, spans) + + return doc + + +# --------------------------------------------------------------------------- +# Detection helpers +# --------------------------------------------------------------------------- + +def _detect_citation_style(doc: Document) -> str: + """Return 'abnt', 'vancouver_bracket', or 'vancouver_superscript'.""" + body_paras = _body_paragraphs(doc) + full_text = " ".join(p.text for p in body_paras) + + # Bracket citations [1] or [1,2] are the most unambiguous signal. + brackets = re.findall(r'\[\d+(?:[,\-]\d+)*\]', full_text) + if len(brackets) >= 3: + return "vancouver_bracket" + + # Superscript digit runs — require a high count to avoid mistaking + # footnote markers or ordinals in ABNT documents. + sup_count = sum( + 1 + for para in body_paras + for run in para.runs + if run.font.superscript and re.fullmatch(r'[\d,\s\-]+', run.text.strip()) + ) + abnt_count = len(re.findall( + r'\([A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇ][^()]{2,80}\d{4}[^()]*\)', full_text + )) + # Declare superscript only when it clearly dominates over ABNT matches. + if sup_count >= 10 and sup_count > abnt_count * 3: + return "vancouver_superscript" + + return "abnt" + + +def _iter_all_paragraphs(doc: Document): + """Yield all Paragraph objects in document order, including inside tables.""" + from docx.text.paragraph import Paragraph as _Para + for p_elem in doc.element.body.iter(qn("w:p")): + yield _Para(p_elem, doc) + + +_METADATA_RE = re.compile( + r'^(?:received|accepted|published|available\s+at|doi\s*:|https?://)', + re.IGNORECASE, +) + +_YEAR_RE_SIMPLE = re.compile(r'\b(?:1[89]|20)\d{2}\b') + +def _find_references_section(doc: Document) -> list: + """Return list of (paragraph_index, paragraph) for reference entries.""" + in_refs = False + refs = [] + for i, para in enumerate(_iter_all_paragraphs(doc)): + text = para.text.strip() + text_lower = text.lower() + if text_lower in _REF_HEADINGS: + in_refs = True + continue + if not in_refs: + continue + # Stop at known post-reference section headings + if text_lower in _STOP_HEADINGS: + break + # Stop at Word heading styles (Heading 1/2/3/...) + style_name = (para.style.name or '') if para.style else '' + if re.match(r'heading\s*\d', style_name, re.IGNORECASE): + break + # Stop at ALL-CAPS short paragraphs without a year — section headings + # like "CONTRIBUIÇÕES DOS AUTORES", "EDITOR ASSOCIADO", etc. + if (text and len(text) <= 60 + and _ALLCAPS_STOP_RE.match(text) + and not _YEAR_RE_SIMPLE.search(text)): + break + if text and not _METADATA_RE.match(text): + refs.append((i, para)) + return refs + + +def _build_ref_index(refs: list) -> list: + """Return list of (n, first_author_normalized, year, para) for ABNT matching.""" + index = [] + year_re = re.compile(r'\b((?:1[89]|20)\d{2}[a-z]?)\b') + for n, (_, para) in enumerate(refs, start=1): + text = para.text.strip() + year_m = year_re.search(text) + year = year_m.group(1) if year_m else "" + first_author = _normalize(_first_surname(text)) + index.append((n, first_author, year, para)) + return index + + +def _first_surname(ref_text: str) -> str: + """Extract the first author surname from a reference string.""" + # ABNT: SOBRENOME, Iniciais. → first word before comma + # Vancouver: Sobrenome AB, ... → first word + m = re.match(r'^([A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒ][A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒa-záéíóúàâêôãõüçäöïëøåæœ\-]+)', ref_text.strip()) + return m.group(1) if m else ref_text[:10] + + +def _normalize(text: str) -> str: + """Lowercase + remove accents for fuzzy comparison.""" + nfkd = unicodedata.normalize("NFKD", text) + return "".join(c for c in nfkd if not unicodedata.combining(c)).lower() + + +def _body_paragraphs(doc: Document) -> list: + """Return paragraphs that belong to the article body (before references).""" + body = [] + for para in _iter_all_paragraphs(doc): + if para.text.strip().lower() in _REF_HEADINGS: + break + body.append(para) + return body + + +# --------------------------------------------------------------------------- +# Citation finders — return {para: [(start, end, anchor), ...]} +# --------------------------------------------------------------------------- + +def _find_citations_bracket(doc: Document) -> dict: + """Find [n] and [n,m] citations and map them to xref_B* anchors.""" + result: dict = {} + pattern = re.compile(r'\[(\d+(?:[,\-]\d+)*)\]') + + for para in _body_paragraphs(doc): + text = para.text + spans = [] + for m in pattern.finditer(text): + numbers = _expand_range(m.group(1)) + for n in numbers: + anchor = f"{BOOKMARK_PREFIX}{n}" + spans.append((m.start(), m.end(), anchor, m.group(0))) + if spans: + result[para] = spans + return result + + +def _find_citations_superscript(doc: Document) -> dict: + """Find superscript-number citations and map them to xref_B* anchors.""" + result: dict = {} + + for para in _body_paragraphs(doc): + spans = [] + pos = 0 + for run in para.runs: + run_text = run.text + run_end = pos + len(run_text) + if run.font.superscript and re.fullmatch(r'[\d,\s\-]+', run_text.strip()): + # Strip leading/trailing commas that Word sometimes includes + # in the same superscript run as punctuation separators. + clean = run_text.strip().strip(',').strip() + numbers = _expand_range(clean.replace(" ", "")) + for n in numbers: + anchor = f"{BOOKMARK_PREFIX}{n}" + spans.append((pos, run_end, anchor, clean)) + pos = run_end + if spans: + result[para] = spans + return result + + +def _find_citations_abnt(doc: Document, ref_index: list) -> dict: + """ + Find ABNT citations in both forms and match against ref_index: + - Parenthetical: (Author, 2020) or (Author et al., 2020; Author2, 2021) + - Narrative: Author (2020) or Author et al. (2020) or Author and Author (2020) + """ + result: dict = {} + paren_re = re.compile( + r'\(([A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇ][^\(\)]{2,100}\d{4}[^\(\)]*)\)', + re.UNICODE, + ) + year_re = re.compile(r'\b(1[89]\d{2}|20\d{2})\b') + + # Surname token: handles hyphen-compounds with optional space (e.g. "Ilkiu-Borges" or "Ilkiu -Borges") + _sname = ( + r'[A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒ][A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒa-záéíóúàâêôãõüçäöïëøåæœ]+' + r'(?:\s*-\s*[A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒa-záéíóúàâêôãõüçäöïëøåæœ]+)*' + ) + narrative_re = re.compile( + r'(' + _sname + r'(?:\s+(?:and|&)\s+' + _sname + r')*(?:\s+et\s+al\.)?)' + r'\s*\((\d{4}[a-z]?(?:,\s*\d{4}[a-z]?)*)\)', + re.UNICODE, + ) + + for para in _body_paragraphs(doc): + text = para.text + spans = [] + covered: set[tuple[int, int]] = set() + + # 1. Parenthetical citations: (Author, year) — split on ";" for multiple + for m in paren_re.finditer(text): + inner = m.group(1) + parts = [p.strip() for p in inner.split(";")] + for part in parts: + year_m = year_re.search(part) + if not year_m: + continue + year = year_m.group(1) + surname_m = re.match(r'([A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇ][^\s,]+)', part) + if not surname_m: + continue + surname = _normalize(surname_m.group(1)) + anchor = _match_abnt(surname, year, ref_index) + if anchor and (m.start(), m.end()) not in covered: + spans.append((m.start(), m.end(), anchor, m.group(0))) + covered.add((m.start(), m.end())) + + # 2. Narrative citations: Author (year) — not already covered by parenthetical + for m in narrative_re.finditer(text): + if (m.start(), m.end()) in covered: + continue + author_part = m.group(1).strip() + years_str = m.group(2) + # Extract first surname (strip et al. first) + author_clean = re.sub(r'\s+et\s+al\.', '', author_part) + first_token = re.match(r'([^\s]+)', author_clean) + if not first_token: + continue + surname = _normalize(first_token.group(1)) + # Try each year in the citation until one matches (handles "Author (1976, 1984, 1985)") + anchor = None + for yr in re.findall(r'\d{4}[a-z]?', years_str): + anchor = _match_abnt(surname, yr, ref_index) + if anchor: + break + if anchor and (m.start(), m.end()) not in covered: + spans.append((m.start(), m.end(), anchor, m.group(0))) + covered.add((m.start(), m.end())) + + if spans: + result[para] = spans + return result + + +def _match_abnt(surname: str, year: str, ref_index: list) -> str | None: + """Return xref_Bn for the best match, or None.""" + skey = surname[:5] + year_plain = year[:4] + # Exact match first (preserves 2004a vs 2004b disambiguation) + for n, first_author, ref_year, _ in ref_index: + if ref_year == year and first_author.startswith(skey): + return f"{BOOKMARK_PREFIX}{n}" + # Fallback: compare first 4 chars (handles refs stored without suffix) + for n, first_author, ref_year, _ in ref_index: + if ref_year[:4] == year_plain and first_author.startswith(skey): + return f"{BOOKMARK_PREFIX}{n}" + return None + + +def _expand_range(token: str) -> list[int]: + """'3,5' → [3,5]; '7-9' → [7,8,9]; '2' → [2].""" + numbers = [] + for part in token.split(","): + part = part.strip() + if "-" in part: + a, b = part.split("-", 1) + try: + numbers.extend(range(int(a), int(b) + 1)) + except ValueError: + pass + else: + try: + numbers.append(int(part)) + except ValueError: + pass + return numbers + + +# --------------------------------------------------------------------------- +# XML manipulation +# --------------------------------------------------------------------------- + +def _next_bookmark_id(doc: Document) -> int: + """Return an id value safe to use for new bookmarks.""" + existing = re.findall(r'w:id="(\d+)"', doc.element.xml) + return max((int(i) for i in existing), default=0) + 1 + + +def _add_bookmark_to_para(para, name: str, bk_id: int): + """Wrap the paragraph content in a named bookmark.""" + p = para._p + + bk_start = OxmlElement("w:bookmarkStart") + bk_start.set(qn("w:id"), str(bk_id)) + bk_start.set(qn("w:name"), name) + + bk_end = OxmlElement("w:bookmarkEnd") + bk_end.set(qn("w:id"), str(bk_id)) + + p.insert(0, bk_start) + p.append(bk_end) + + +def _insert_hyperlinks(para, spans: list): + """ + Replace citation text in *para* with internal hyperlinks. + + *spans* is a list of (start, end, anchor, original_text) tuples, where + start/end are character offsets in ``para.text``. + Multiple citations pointing to the same span are merged into separate + hyperlinks inserted consecutively. + """ + # Deduplicate spans on (start, end) keeping first match only + seen = set() + unique_spans = [] + for span in sorted(spans, key=lambda s: s[0]): + key = (span[0], span[1]) + if key not in seen: + seen.add(key) + unique_spans.append(span) + + # Rebuild paragraph XML run-by-run, inserting hyperlinks at citation positions + p = para._p + full_text = para.text + + # Collect (run_element, run_start, run_end) from current runs + run_segments = [] + pos = 0 + for child in p: + tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag + if tag == "r": + t_elem = child.find(qn("w:t")) + text = t_elem.text if t_elem is not None and t_elem.text else "" + run_segments.append((child, pos, pos + len(text))) + pos += len(text) + elif tag == "hyperlink": + # Already a hyperlink — count its text length + inner_text = "".join( + (t.text or "") for t in child.iter(qn("w:t")) + ) + run_segments.append((child, pos, pos + len(inner_text))) + pos += len(inner_text) + + if not run_segments: + return + + # Build list of "what goes where" in character-offset order + # Each item: ('run', elem) or ('hyperlink', anchor, text, template_run) + events = [] # (char_offset, type, ...) + + # Mark citation zones + citation_zones = {(s, e): (anchor, txt) for s, e, anchor, txt in unique_spans} + + offset = 0 + seg_idx = 0 + while seg_idx < len(run_segments) and offset < len(full_text): + elem, seg_start, seg_end = run_segments[seg_idx] + tag = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag + + # Check if a citation zone starts here + matched_zone = None + for (z_start, z_end), (anchor, cit_text) in citation_zones.items(): + if seg_start <= z_start < seg_end or (z_start <= seg_start < z_end): + matched_zone = (z_start, z_end, anchor, cit_text) + break + + if matched_zone is None or tag == "hyperlink": + events.append(("keep", elem)) + seg_idx += 1 + continue + + z_start, z_end, anchor, cit_text = matched_zone + + # Split first run: extract text before the citation starts + t_node = elem.find(qn("w:t")) + run_text = t_node.text if t_node is not None and t_node.text else "" + before = run_text[:max(0, z_start - seg_start)] + if before: + r_before = copy.deepcopy(elem) + r_before.find(qn("w:t")).text = before + events.append(("keep", r_before)) + + # Advance seg_idx to the last run that overlaps this zone. + # Citations like "Costa et al. (2020)" span multiple runs; + # without this loop only the first run portion would be hyperlinked. + while seg_idx + 1 < len(run_segments) and run_segments[seg_idx + 1][1] < z_end: + seg_idx += 1 + + # Extract text after the citation ends from the last run in the zone + last_elem, last_start, _ = run_segments[seg_idx] + last_t = last_elem.find(qn("w:t")) + last_text = last_t.text if last_t is not None and last_t.text else "" + after = last_text[max(0, z_end - last_start):] + + # Emit hyperlink with full citation text (first run used as style template) + events.append(("hyperlink", anchor, cit_text, elem)) + + if after: + r_after = copy.deepcopy(last_elem) + t = r_after.find(qn("w:t")) + if t is not None: + t.text = after + if after.startswith(" ") or after.endswith(" "): + t.set("{http://www.w3.org/XML/1998/namespace}space", "preserve") + events.append(("keep", r_after)) + + del citation_zones[(z_start, z_end)] + seg_idx += 1 + + # Remaining segments + for elem, _, _ in run_segments[seg_idx:]: + events.append(("keep", elem)) + + # Remove old run/hyperlink children from paragraph + for elem, _, _ in run_segments: + if elem in p: + p.remove(elem) + + # Re-insert in order + insert_pos = 0 + # Find insertion point (after pPr if present) + ppr = p.find(qn("w:pPr")) + insert_after = ppr if ppr is not None else None + + for event in events: + if event[0] == "keep": + elem = event[1] + if insert_after is not None: + insert_after.addnext(elem) + insert_after = elem + else: + p.insert(insert_pos, elem) + insert_pos += 1 + else: + _, anchor, cit_text, template_run = event + hl = _make_hyperlink(anchor, cit_text, template_run) + if insert_after is not None: + insert_after.addnext(hl) + insert_after = hl + else: + p.insert(insert_pos, hl) + insert_pos += 1 + + +def build_text_xref_replacer(doc: Document): + """ + Build a callable that tags 'Author (year)' narrative citations with . + Builds the reference lookup directly from the reference list section in *doc*, + assigning B1..Bn by position (consistent with read_marks / xml.py convention). + Returns: apply(text: str) -> str + """ + refs = _find_references_section(doc) + ref_list = [ + {'rid': f'B{i + 1}', 'ref_text': para.text.strip()} + for i, (_, para) in enumerate(refs) + ] + return _make_text_xref_fn(ref_list) + + +def make_text_xref_fn_from_refs(ref_items: list): + """ + Build a narrative xref replacer from reference dicts with keys: + {'rid'|'refid': 'Bn', 'ref_text'|'paragraph': '...'}. + Returns: apply(text: str) -> str + """ + return _make_text_xref_fn(ref_items) + + +def _make_text_xref_fn(ref_list: list): + """Build the 'Author (year)' replacer function from a list of reference dicts.""" + # Year regex includes optional letter suffix (e.g. 2004a, 2004b) + _year_re = re.compile(r'\b((?:1[89]|20)\d{2}[a-z]?)\b') + # Tuples: (skey, year_with_suffix, rid, full_ref_text_normalized) for compound-author lookup + ref_entries: list[tuple[str, str, str, str]] = [] + # Simple primary lookup: first match wins + ref_lookup: dict[tuple[str, str], str] = {} + + for i, item in enumerate(ref_list): + rid = item.get('rid') or item.get('refid') or f'B{i + 1}' + text = item.get('ref_text') or item.get('paragraph') or '' + if not text: + continue + skey = _normalize(_first_surname(text))[:5] + norm_text = _normalize(text) + for year in _year_re.findall(text)[:4]: + ref_entries.append((skey, year, rid, norm_text)) + if (skey, year) not in ref_lookup: + ref_lookup[(skey, year)] = rid + + if not ref_entries: + return lambda t: t + + def _lookup(skey: str, year: str, extra_skeys: list[str]) -> str | None: + """Find best rid: prefer entries containing all author surnames.""" + candidates = [(rid, norm) for s, y, rid, norm in ref_entries if s == skey and y == year] + if not candidates: + return None + if len(candidates) == 1 or not extra_skeys: + return candidates[0][0] + # Prefer candidate whose text contains the extra authors + for rid, norm in candidates: + if all(sk in norm for sk in extra_skeys): + return rid + return candidates[0][0] + + # Reusable surname token: handles "Ilkiu-Borges" and "Ilkiu -Borges" (space before hyphen) + _sname = ( + r'[A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒ][A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒa-záéíóúàâêôãõüçäöïëøåæœ]+' + r'(?:\s*-\s*[A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒa-záéíóúàâêôãõüçäöïëøåæœ]+)*' + ) + # et al. can appear as plain text or wrapped in tags + _etal = r'(?:\s+(?:et\s+al\.|et\s+al\.?\.?))?' + # Match: Surname [and/& Surname]* [et al.] (year[a-z]?[, year[a-z]?]*) + _narrative_re = re.compile( + r'(' + _sname + r'(?:\s+(?:and|&)\s+' + _sname + r')*' + _etal + r')' + r'\s*\((\d{4}[a-z]?(?:,\s*\d{4}[a-z]?)*)\)', + re.UNICODE, + ) + _split_re = re.compile(r'(]*>.*?)', re.DOTALL) + _etal_strip = re.compile(r'\s+(?:et\s+al\.|et\s+al\.?\.?)') + + def _replace(m: re.Match) -> str: + full = m.group(0) + author_part = m.group(1).strip() + years_str = m.group(2) + # Remove et al., then split on and/& to get individual author tokens + author_clean = _etal_strip.sub('', author_part) + author_tokens = re.split(r'\s+(?:and|&)\s+', author_clean) + skeys = [_normalize(t.split()[0])[:5] for t in author_tokens if t.strip()] + if not skeys: + return full + primary_skey = skeys[0] + extra_skeys = skeys[1:] + rids: list[str] = [] + for year in re.findall(r'\d{4}[a-z]?', years_str): + rid = _lookup(primary_skey, year, extra_skeys) + if rid and rid not in rids: + rids.append(rid) + if not rids: + return full + return f'{full}' + + _paren_inner_re = re.compile( + r'\(([A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒ][^\(\)]{2,200}\d{4}[^\(\)]*)\)', + re.UNICODE, + ) + _paren_year_re = re.compile(r'\b(1[89]\d{2}|20\d{2})\b') + _paren_author_re = re.compile(r'([A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒ][^\s,;]+)') + + def _replace_paren(m: re.Match) -> str: + full = m.group(0) + inner = m.group(1) + parts = [p.strip() for p in re.split(r'[;,]\s*(?=[A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒ])', inner)] + rids: list[str] = [] + for part in parts: + yr_m = _paren_year_re.search(part) + if not yr_m: + continue + au_m = _paren_author_re.match(part) + if not au_m: + continue + skey = _normalize(au_m.group(1))[:5] + rid = _lookup(skey, yr_m.group(1), []) + if rid and rid not in rids: + rids.append(rid) + if not rids: + return full + return f'{full}' + + def apply(text: str) -> str: + if not text: + return text + parts = _split_re.split(text) + result = [] + for idx, part in enumerate(parts): + if idx % 2 != 0: + result.append(part) + continue + # Narrative first, then parenthetical on remaining non-xref text + part = _narrative_re.sub(_replace, part) + sub_parts = _split_re.split(part) + out = [] + for i, sp in enumerate(sub_parts): + if i % 2 != 0: + out.append(sp) + else: + out.append(_paren_inner_re.sub(_replace_paren, sp)) + result.append(''.join(out)) + return ''.join(result) + + return apply + + +def _make_hyperlink(anchor: str, text: str, template_run) -> object: + """Create a element.""" + hl = OxmlElement("w:hyperlink") + hl.set(qn("w:anchor"), anchor) + + r = copy.deepcopy(template_run) + # Ensure rPr exists and add Hyperlink style + rpr = r.find(qn("w:rPr")) + if rpr is None: + rpr = OxmlElement("w:rPr") + r.insert(0, rpr) + style_elem = OxmlElement("w:rStyle") + style_elem.set(qn("w:val"), "Hyperlink") + rpr.insert(0, style_elem) + + t = r.find(qn("w:t")) + if t is not None: + t.text = text + if text.startswith(" ") or text.endswith(" "): + t.set("{http://www.w3.org/XML/1998/namespace}space", "preserve") + + hl.append(r) + return hl \ No newline at end of file From d104670d3ebcd2b0429f20e71103ac82a476349a Mon Sep 17 00:00:00 2001 From: Rossi-Luciano Date: Sun, 31 May 2026 18:52:33 -0300 Subject: [PATCH 02/11] Corrige bugs em labeling_utils e function_docx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit labeling_utils.py: - Regex de detecção da seção de referências mais precisa (refer[eê]nci|references?) - resp_json inicializado antes do bloco condicional (evitava UnboundLocalError) - Escapa '<' literal em append_fragment para não quebrar o parser XML - Guarda None em proccess_special_content para search_special_id retornando None function_docx.py: - is_numPr inicializado antes do loop (tabelas após listas eram descartadas silenciosamente por flag herdada da iteração anterior) - Parágrafos adicionados a content independente de tabelas adjacentes Co-Authored-By: Claude Sonnet 4.6 --- markup_doc/labeling_utils.py | 44 ++++++++++++++++++++---------------- markuplib/function_docx.py | 4 ++++ 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/markup_doc/labeling_utils.py b/markup_doc/labeling_utils.py index f66d3d6..4ff3dbe 100644 --- a/markup_doc/labeling_utils.py +++ b/markup_doc/labeling_utils.py @@ -773,6 +773,17 @@ def create_labeled_object2(i, item, state, sections): obj["type"] = "paragraph" obj["value"] = {"label": state["label"], "paragraph": item.get("text")} + if state.get("body") and re.search( + r"^(refer[eê]nci|references?)\s*$", item.get("text").strip().lower() + ): + state["label"] = "" + state["body"] = False + state["back"] = True + result = {"label": "", "body": False, "back": True} + obj["type"] = "paragraph" + obj["value"] = {"label": state["label"], "paragraph": item.get("text")} + + if not result: result = {"label": "

", "body": state["body"], "back": state["back"]} state["label"] = result.get("label") @@ -874,12 +885,12 @@ def get_data_first_block(text, metadata, user_id): "Content-Type": "application/json", } + resp_json = {} response = requests.post(url, json=payload, headers=headers) if response.status_code == 200: response_json = response.json() message_str = response_json["message"] - resp_json = json.loads(message_str) return resp_json @@ -1279,6 +1290,7 @@ def append_fragment(node_dest, val): clean = escape_angle_brackets_outside_tags(clean) clean = remove_unpaired_tags(clean) + clean = re.sub(r'<(?![/a-zA-Z_])', '<', clean) if clean == "": parent = node_dest.getparent() @@ -1351,23 +1363,17 @@ def proccess_special_content(text, data_body): res = [] dict_type = {"f": "fig", "t": "table", "e": "disp-formula"} - try: - for match in re.finditer( - pattern, text, re.IGNORECASE | re.UNICODE | re.VERBOSE - ): - label = match.group(0) - - id = search_special_id(data_body, label) - - res.append( - { - "label": label, - "id": id, - "reftype": dict_type.get(id[0].lower(), "other"), - } - ) - except Exception as exc: - print(f"ERROR proccess_special_content: {exc}") - pass + for match in re.finditer(pattern, text, re.IGNORECASE | re.UNICODE | re.VERBOSE): + label = match.group(0) + id = search_special_id(data_body, label) + if id is None: + continue + res.append( + { + "label": label, + "id": id, + "reftype": dict_type.get(id[0].lower(), "other"), + } + ) return res diff --git a/markuplib/function_docx.py b/markuplib/function_docx.py index f92c31d..c9646cd 100644 --- a/markuplib/function_docx.py +++ b/markuplib/function_docx.py @@ -339,6 +339,10 @@ def extrae_Tabla(element, rels_map, namespaces): is_numPr = False if isinstance(element, CT_P): obj = {} + paragraph = element + text_paragraph = [] + _ns_w = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} + is_numPr = paragraph.find('.//w:numPr', namespaces=_ns_w) is not None namespaces = { "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", From 4ca261d3be380f801aaea40941d5af48bba93343 Mon Sep 17 00:00:00 2001 From: Rossi-Luciano Date: Sun, 31 May 2026 18:52:56 -0300 Subject: [PATCH 03/11] =?UTF-8?q?Adiciona=20bot=C3=A3o=20Reprocessar=20na?= =?UTF-8?q?=20interface=20Wagtail?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Insere botão na barra de ações do cabeçalho nas views de edição de ProcessedDocx e MarkupXML, com mensagem de confirmação contextual. Usa MutationObserver como fallback para injeção no DOM quando o cabeçalho ainda não está renderizado. Co-Authored-By: Claude Sonnet 4.6 --- markup_doc/static/js/xref-button.js | 70 ++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/markup_doc/static/js/xref-button.js b/markup_doc/static/js/xref-button.js index 5950941..2b64935 100644 --- a/markup_doc/static/js/xref-button.js +++ b/markup_doc/static/js/xref-button.js @@ -523,8 +523,76 @@ function get_zip() { // También en DOMContentLoaded por si basta document.addEventListener("DOMContentLoaded", tryAttach); - + // Llama una vez por si ya está listo tryAttach(); })(); + +// Botão Reprocessar — aparece nas views de edição de ProcessedDocx e MarkupXML +(function () { + var path = window.location.pathname; + var isProcessedDocx = path.indexOf('processeddocx/edit/') !== -1; + var isMarkupXml = path.indexOf('markupxml/edit/') !== -1; + + if (!isProcessedDocx && !isMarkupXml) return; + + var match = path.match(/\/edit\/(\d+)\//); + if (!match) return; + var pk = match[1]; + + function makeBtn() { + var btn = document.createElement('button'); + btn.type = 'button'; + btn.id = 'reprocess-btn'; + btn.textContent = 'Reprocessar'; + btn.style.cssText = [ + 'padding:4px 12px', + 'cursor:pointer', + 'background:#e9a000', + 'color:white', + 'font-weight:bold', + 'border:none', + 'border-radius:4px', + 'margin-left:8px', + 'font-size:14px', + ].join(';'); + btn.addEventListener('mouseover', function () { btn.style.background = '#c98000'; }); + btn.addEventListener('mouseout', function () { btn.style.background = '#e9a000'; }); + btn.addEventListener('click', function () { + var msg = isMarkupXml + ? 'Isso irá descartar as edições manuais e reprocessar o DOCX original. Continuar?' + : 'Reprocessar este documento?'; + if (confirm(msg)) { + window.location.href = '/admin/reprocess/' + pk + '/'; + } + }); + return btn; + } + + function tryInsert() { + if (document.getElementById('reprocess-btn')) return true; + // Tenta área de ações do cabeçalho Wagtail (v5/v6) + var actionArea = document.querySelector('.w-slim-header__action-buttons') + || document.querySelector('[data-controller="w-slim-header"] .w-slim-header__title-wrapper'); + if (actionArea) { + actionArea.appendChild(makeBtn()); + return true; + } + // Fallback: insere após o primeiro botão submit (salvar) + var saveBtn = document.querySelector('button[type="submit"]'); + if (saveBtn && saveBtn.parentNode) { + saveBtn.parentNode.insertBefore(makeBtn(), saveBtn.nextSibling); + return true; + } + return false; + } + + if (!tryInsert()) { + var obs = new MutationObserver(function () { + if (tryInsert()) obs.disconnect(); + }); + obs.observe(document.documentElement, { childList: true, subtree: true }); + } +})(); + From 5fd4849b6abbbaed58cd9bacc59b2ed9f250e17e Mon Sep 17 00:00:00 2001 From: Rossi-Luciano Date: Mon, 1 Jun 2026 09:12:48 -0300 Subject: [PATCH 04/11] Corrige IndentationError no bloco do abstract em xml.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remoção de guarda duplicada `if vals2 and vals2[0]:` que ficou sem corpo após resolução de conflito de rebase, causando IndentationError. Co-Authored-By: Claude Sonnet 4.6 --- markup_doc/xml.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/markup_doc/xml.py b/markup_doc/xml.py index e94ccf8..18ed8bc 100644 --- a/markup_doc/xml.py +++ b/markup_doc/xml.py @@ -477,8 +477,6 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None): append_fragment(node_tmp2, vals[0].value.get("paragraph")) if vals2 and vals2[0]: - - if vals2: # Encuentra su índice original en article_docx.content last_index = data_t.index(vals2[0]) From 2d9cb26a2be7b877bd034d9d688326af2ac665db Mon Sep 17 00:00:00 2001 From: Rossi-Luciano Date: Mon, 1 Jun 2026 09:18:45 -0300 Subject: [PATCH 05/11] Corrige KeyError em reftype/refid nos blocos book e thesis de xml.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dois acessos diretos a values["reftype"] sobreviveram à resolução de conflito do rebase. Substituídos por values.get("reftype") para não lançar KeyError quando o LLM não parseia o tipo da referência. Co-Authored-By: Claude Sonnet 4.6 --- markup_doc/xml.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/markup_doc/xml.py b/markup_doc/xml.py index 18ed8bc..0824ec9 100644 --- a/markup_doc/xml.py +++ b/markup_doc/xml.py @@ -916,7 +916,7 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None): node_elem = etree.SubElement( node_ref, "element-citation", - attrib={"publication-type": values["reftype"]}, + attrib={"publication-type": values.get("reftype")}, ) node_person = etree.SubElement( node_elem, "person-group", attrib={"person-group-type": "author"} @@ -1060,11 +1060,11 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None): ) append_fragment(etree.SubElement(node_ref, "page"), values["pages"]) - if values["reftype"] == "thesis": + if values.get("reftype") == "thesis": node_elem = etree.SubElement( node_ref, "element-citation", - attrib={"publication-type": values["reftype"]}, + attrib={"publication-type": values.get("reftype")}, ) node_person = etree.SubElement( node_elem, "person-group", attrib={"person-group-type": "author"} From 807e17d9bae0f276e4a4057a88cae54535f5beae Mon Sep 17 00:00:00 2001 From: Rossi-Luciano Date: Mon, 1 Jun 2026 09:44:34 -0300 Subject: [PATCH 06/11] =?UTF-8?q?Corrige=20substitui=C3=A7=C3=A3o=20de=20x?= =?UTF-8?q?ref=20dentro=20de=20atributos=20em=20tasks.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit O passe de xref_map em stream_data_body usava para.replace() cru, sem segmentação. Para Vancouver superscript, chaves curtas como "2" substituíam o dígito dentro de rid="B2" já criado, corrompendo o atributo: rid="B2". Substituído por _apply_xref_map_safe() com a mesma lógica segmentada de xml.py: divide nos limites de existentes e aplica o replace apenas nos segmentos de texto puro. Co-Authored-By: Claude Sonnet 4.6 --- markup_doc/tasks.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/markup_doc/tasks.py b/markup_doc/tasks.py index b4f3988..167f021 100644 --- a/markup_doc/tasks.py +++ b/markup_doc/tasks.py @@ -443,19 +443,33 @@ def get_labels(article_id, user_id): stream_data = [item for item in stream_data if item not in rescued] # Apply xref_map (DOCX hyperlinks) and narrative Author (year) xrefs to body. + # Use segment-based replacement to avoid substituting inside already-created + # tags (e.g. short Vancouver superscript keys like "2" would otherwise + # corrupt rid="B2" attribute values on the second iteration of the loop). + _xref_split_re = re.compile(r'(]*>.*?)', re.DOTALL) + + def _apply_xref_map_safe(text, xmap): + parts = _xref_split_re.split(text) + result = [] + for i, part in enumerate(parts): + if i % 2 != 0: + result.append(part) + continue + for cit_text, rid in sorted(xmap.items(), key=lambda x: -len(x[0])): + part = part.replace( + cit_text, + f'{cit_text}', + ) + result.append(part) + return ''.join(result) + for item in stream_data_body: if item.get('value', {}).get('label') == '

': para = item['value'].get('paragraph', '') or '' if not para: continue - # 1. Dict-based from DOCX hyperlinks if xref_map: - for cit_text, rid in sorted(xref_map.items(), key=lambda x: -len(x[0])): - para = para.replace( - cit_text, - f'{cit_text}', - ) - # 2. Narrative "Author (year)" citations + para = _apply_xref_map_safe(para, xref_map) para = text_xref_fn(para) item['value']['paragraph'] = para From 74951f89e11b4c7ca8211c71c24394b108e23c15 Mon Sep 17 00:00:00 2001 From: Rossi-Luciano Date: Mon, 1 Jun 2026 09:49:49 -0300 Subject: [PATCH 07/11] =?UTF-8?q?Corrige=20corrup=C3=A7=C3=A3o=20de=20atri?= =?UTF-8?q?butos=20xref=20em=20substitui=C3=A7=C3=B5es=20sequenciais?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit O bug: _apply_xref_map acumulava replacements dentro do mesmo segmento. Após substituir "20" → 20, a iteração seguinte substituía "2" dentro do próprio rid="B20" já criado, produzindo rid="B20" — XML inválido. Fix: um _apply_to_segments por citação (não um for-loop dentro do segmento). A cada iteração, os criados tornam-se fronteiras que protegem as iterações seguintes. Aplicado em tasks.py (_apply_xref_map_safe) e xml.py (_apply_xref_map). Co-Authored-By: Claude Sonnet 4.6 --- markup_doc/tasks.py | 25 ++++++++++++------------- markup_doc/xml.py | 22 +++++++++++++--------- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/markup_doc/tasks.py b/markup_doc/tasks.py index 167f021..08c1932 100644 --- a/markup_doc/tasks.py +++ b/markup_doc/tasks.py @@ -448,20 +448,19 @@ def get_labels(article_id, user_id): # corrupt rid="B2" attribute values on the second iteration of the loop). _xref_split_re = re.compile(r'(]*>.*?)', re.DOTALL) - def _apply_xref_map_safe(text, xmap): + def _apply_to_seg(text, fn): parts = _xref_split_re.split(text) - result = [] - for i, part in enumerate(parts): - if i % 2 != 0: - result.append(part) - continue - for cit_text, rid in sorted(xmap.items(), key=lambda x: -len(x[0])): - part = part.replace( - cit_text, - f'{cit_text}', - ) - result.append(part) - return ''.join(result) + return ''.join(fn(p) if i % 2 == 0 else p for i, p in enumerate(parts)) + + def _apply_xref_map_safe(text, xmap): + # Apply one citation at a time so that tags created by earlier + # iterations become boundaries for later, shorter keys. + for cit_text, rid in sorted(xmap.items(), key=lambda x: -len(x[0])): + replacement = f'{cit_text}' + text = _apply_to_seg( + text, lambda seg, ct=cit_text, r=replacement: seg.replace(ct, r) + ) + return text for item in stream_data_body: if item.get('value', {}).get('label') == '

': diff --git a/markup_doc/xml.py b/markup_doc/xml.py index 0824ec9..ef70bbb 100644 --- a/markup_doc/xml.py +++ b/markup_doc/xml.py @@ -28,15 +28,19 @@ def _apply_to_segments(text, fn): def _apply_xref_map(paragraph, xref_map): - """Apply xref_map replacements segment-by-segment to avoid double-wrapping.""" - def replace_in_segment(seg): - for cit_text, rid in sorted(xref_map.items(), key=lambda x: -len(x[0])): - seg = seg.replace( - cit_text, - f'{cit_text}', - ) - return seg - return _apply_to_segments(paragraph, replace_in_segment) + """Apply xref_map replacements one citation at a time. + + Each citation is applied via a fresh _apply_to_segments pass so that + tags created by earlier iterations are respected as boundaries + for later, shorter keys (e.g. "20" is replaced first, then "2" must + not touch the already-created rid="B20" attribute). + """ + for cit_text, rid in sorted(xref_map.items(), key=lambda x: -len(x[0])): + replacement = f'{cit_text}' + paragraph = _apply_to_segments( + paragraph, lambda seg, ct=cit_text, r=replacement: seg.replace(ct, r) + ) + return paragraph def _apply_proccess_labeled_text(paragraph, data_back): From 71aaca6f94da55ca14397a713902c2e99b6623fa Mon Sep 17 00:00:00 2001 From: Rossi-Luciano Date: Mon, 1 Jun 2026 09:54:01 -0300 Subject: [PATCH 08/11] Adiciona guards .get() em acessos a figid, figlabel, tabid, eid em xml.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Artigos com figuras sem figlabel levantavam KeyError. Substituídos por .get() com fallback vazio para todos os campos opcionais de elementos especiais (fig, table, formula). Co-Authored-By: Claude Sonnet 4.6 --- markup_doc/xml.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/markup_doc/xml.py b/markup_doc/xml.py index ef70bbb..d593a75 100644 --- a/markup_doc/xml.py +++ b/markup_doc/xml.py @@ -654,7 +654,7 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None): node_list.append(child) if d["value"]["label"] == "" or d["value"]["label"] == "": - attrib = {"id": d["value"]["tabid"]} + attrib = {"id": d["value"].get("tabid", "")} if subsec: node_p = etree.SubElement(node_sec, "p") @@ -694,7 +694,7 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None): append_fragment(node_fnp, d["value"]["paragraph"]) if d["value"]["label"] == "": - attrib = {"id": d["value"]["figid"]} + attrib = {"id": d["value"].get("figid", "")} if subsec: node_p = etree.SubElement(node_sec, "p") @@ -703,11 +703,9 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None): node_p = etree.SubElement(node, "p") node_fig = etree.SubElement(node_p, "fig", attrib=attrib) - etree.SubElement(node_fig, "label").text = d["value"]["figlabel"] + etree.SubElement(node_fig, "label").text = d["value"].get("figlabel") node_caption = etree.SubElement(node_fig, "caption") - etree.SubElement(node_caption, "title").text = ( - d["value"]["title"] if "title" in d["value"] else None - ) + etree.SubElement(node_caption, "title").text = d["value"].get("title") Image = get_image_model() image_id = d["value"]["image"] @@ -728,7 +726,7 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None): append_fragment(node_attrib, d["value"]["paragraph"]) if d["value"]["label"] == "": - attrib = {"id": d["value"]["eid"]} + attrib = {"id": d["value"].get("eid", "")} if subsec: node_p = etree.SubElement(node_sec, "p") @@ -745,7 +743,7 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None): append_fragment(node_f, c["value"]) if d["value"]["label"] == "": - attrib = {"id": d["value"]["eid"]} + attrib = {"id": d["value"].get("eid", "")} if subsec: node_p = etree.SubElement(node_sec, "p") From 57bcbbc18fe4574f88c2833e3872f8c600c7ae99 Mon Sep 17 00:00:00 2001 From: Rossi-Luciano Date: Mon, 1 Jun 2026 09:56:07 -0300 Subject: [PATCH 09/11] Corrige KeyError em search_special_id para figid/figlabel/tabid/tablabel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Figuras e tabelas sem rótulo levantavam KeyError em search_special_id. Substituídos por .get() com fallback vazio e guards de comprimento antes de indexar figid/tabid. Co-Authored-By: Claude Sonnet 4.6 --- markup_doc/labeling_utils.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/markup_doc/labeling_utils.py b/markup_doc/labeling_utils.py index 4ff3dbe..af790ff 100644 --- a/markup_doc/labeling_utils.py +++ b/markup_doc/labeling_utils.py @@ -1028,22 +1028,30 @@ def search_special_id(data_body, label): clean_label = re.sub(r"^[\s\.,;:–—-]+", "", label).capitalize() if d["type"] == "image": - if clean_label == data["figlabel"]: - return data.get("figid") + figlabel = data.get("figlabel") or "" + figid = data.get("figid") or "" + if clean_label == figlabel: + return figid or None if ( - data["figid"][0] == clean_label.lower()[0] - and data["figid"][1] in clean_label.lower() + figid + and len(figid) > 1 + and figid[0] == clean_label.lower()[:1] + and figid[1] in clean_label.lower() ): - return data.get("figid") + return figid if d["type"] == "table": - if clean_label == data["tablabel"]: - return data.get("tabid") + tablabel = data.get("tablabel") or "" + tabid = data.get("tabid") or "" + if clean_label == tablabel: + return tabid or None if ( - data["tabid"][0] == clean_label.lower()[0] - and data["tabid"][1] in clean_label.lower() + tabid + and len(tabid) > 1 + and tabid[0] == clean_label.lower()[:1] + and tabid[1] in clean_label.lower() ): - return data.get("tabid") + return tabid for d in data_body: if d["type"] in ["compound_paragraph"]: From cb444dd5e99a8f849630866196fffb48d6522c9a Mon Sep 17 00:00:00 2001 From: Rossi-Luciano Date: Mon, 1 Jun 2026 09:57:41 -0300 Subject: [PATCH 10/11] =?UTF-8?q?Corrige=20AttributeError=20em=20bloco=20=20quando=20regex=20n=C3=A3o=20encontra=20list-type?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fallback para "bullet" quando o parágrafo não contém o padrão esperado. Co-Authored-By: Claude Sonnet 4.6 --- markup_doc/xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/markup_doc/xml.py b/markup_doc/xml.py index d593a75..76f7535 100644 --- a/markup_doc/xml.py +++ b/markup_doc/xml.py @@ -627,7 +627,7 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None): if d["value"]["label"] == "": re_search = re.search(r'list list-type="(.*?)"\]', d["value"]["paragraph"]) - list_type = re_search.group(1) + list_type = re_search.group(1) if re_search else "bullet" attrib = {"list-type": list_type} if subsec: From 30b4b3a948f6999f2c7bda5ec0b705654f73d24c Mon Sep 17 00:00:00 2001 From: Rossi-Luciano Date: Mon, 1 Jun 2026 10:05:39 -0300 Subject: [PATCH 11/11] =?UTF-8?q?Corrige=20AttributeError=20na=20extra?= =?UTF-8?q?=C3=A7=C3=A3o=20de=20conte=C3=BAdo=20de=20lista=20em=20xml.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fallback para string vazia quando o regex de content_list não encontra o padrão [list ...][/list] no parágrafo. Co-Authored-By: Claude Sonnet 4.6 --- markup_doc/xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/markup_doc/xml.py b/markup_doc/xml.py index 76f7535..537efb6 100644 --- a/markup_doc/xml.py +++ b/markup_doc/xml.py @@ -642,7 +642,7 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None): d["value"]["paragraph"], re.DOTALL, ) - content_list = content_list.group(1) + content_list = content_list.group(1) if content_list else "" node_list_text = content_list.replace( "[list-item]", "

" ).replace("[/list-item]", "

")