From 5d1c282359afa3b34d4c6792b5b2bfe11709d853 Mon Sep 17 00:00:00 2001
From: Rossi-Luciano <luciano.rossi.lucross@gmail.com>
Date: Sun, 31 May 2026 18:52:16 -0300
Subject: [PATCH 01/11] =?UTF-8?q?Implementa=20pipeline=20de=20identifica?=
 =?UTF-8?q?=C3=A7=C3=A3o=20e=20inser=C3=A7=C3=A3o=20autom=C3=A1tica=20de?=
 =?UTF-8?q?=20xref=20(DOCX=20=E2=86=92=20SPS=20XML)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adiciona módulo xref.py com detecção de estilo de citação (ABNT narrativo,
ABNT parentético, Vancouver bracket, Vancouver superscript), marcação de
bookmarks e hyperlinks no DOCX, validação de consistência e extração do
mapeamento citação→rid.

Integra o pipeline em tasks.py: o DOCX é marcado antes do processamento
principal, o xref_map é aplicado em stream_data_body e repassado a get_xml().
Ranges Vancouver são expandidos para rid multi-valor (ex: "[26-27]" → "B26 B27").

Em xml.py, substitui a verificação genérica `if 'xref' not in paragraph` por
processamento por segmento via _apply_to_segments(), eliminando risco de
double-wrapping e garantindo que citações em parágrafos parcialmente marcados
não sejam ignoradas. O LLM (proccess_labeled_text) passa a atuar como fallback.

Adiciona campos marked_file e xref_status ao modelo ArticleDocxMarkup, com
widgets visuais, proxy model ProcessedDocx e views download_marked_docx e
reprocess para suporte à revisão humana via interface Wagtail.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../0003_articledocxmarkup_marked_file.py     |  18 +
 .../0004_articledocxmarkup_xref_status.py     |  16 +
 markup_doc/models.py                          | 119 ++-
 markup_doc/tasks.py                           | 100 ++-
 markup_doc/views.py                           |  37 +-
 markup_doc/wagtail_hooks.py                   |  20 +
 markup_doc/xml.py                             | 122 ++-
 markup_doc/xref.py                            | 840 ++++++++++++++++++
 8 files changed, 1224 insertions(+), 48 deletions(-)
 create mode 100644 markup_doc/migrations/0003_articledocxmarkup_marked_file.py
 create mode 100644 markup_doc/migrations/0004_articledocxmarkup_xref_status.py
 create mode 100644 markup_doc/xref.py

diff --git a/markup_doc/migrations/0003_articledocxmarkup_marked_file.py b/markup_doc/migrations/0003_articledocxmarkup_marked_file.py
new file mode 100644
index 0000000..509a53e
--- /dev/null
+++ b/markup_doc/migrations/0003_articledocxmarkup_marked_file.py
@@ -0,0 +1,18 @@
+# Generated by Django 6.0.5 on 2026-05-26 14:16
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('markup_doc', '0002_alter_articledocx_estatus_and_more'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='articledocxmarkup',
+            name='marked_file',
+            field=models.FileField(blank=True, null=True, upload_to='uploads_docx_marked/', verbose_name='Marked Document'),
+        ),
+    ]
diff --git a/markup_doc/migrations/0004_articledocxmarkup_xref_status.py b/markup_doc/migrations/0004_articledocxmarkup_xref_status.py
new file mode 100644
index 0000000..c7c1448
--- /dev/null
+++ b/markup_doc/migrations/0004_articledocxmarkup_xref_status.py
@@ -0,0 +1,18 @@
+# Adds the nullable JSON field ArticleDocxMarkup.xref_status.
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('markup_doc', '0003_articledocxmarkup_marked_file'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='articledocxmarkup',
+            name='xref_status',
+            field=models.JSONField(blank=True, null=True, verbose_name='XRef Status'),
+        ),
+    ]
diff --git a/markup_doc/models.py b/markup_doc/models.py
index b3ef32f..ae16222 100644
--- a/markup_doc/models.py
+++ b/markup_doc/models.py
@@ -1,10 +1,13 @@
+import json
+import os
+
 from django import forms
 from django.db import models
 from django.urls import reverse
-from django.utils.html import format_html
+from django.utils.html import format_html, mark_safe
 from django.utils.translation import gettext_lazy as _
 from modelcluster.models import ClusterableModel
-from wagtail.admin.panels import FieldPanel, ObjectList, TabbedInterface
+from wagtail.admin.panels import FieldPanel, ObjectList, Panel, TabbedInterface
 from wagtail.blocks import ChoiceBlock, StreamBlock, StructBlock, TextBlock
 from wagtail.fields import StreamField
 from wagtail.images.blocks import ImageChooserBlock
@@ -25,8 +28,6 @@ class ProcessStatus(models.IntegerChoices):
 class ReadOnlyFileWidget(forms.Widget):
     def render(self, name, value, attrs=None, renderer=None):
         if value:
-            # Muestra el archivo como un enlace de descarga
-            # return format_html('<a href="{}" target="_blank" download>{}</a>', value.url, value.name.split('/')[-1])
             instance = value.instance
             url = reverse("generate_xml", args=[instance.pk])
             return format_html(
@@ -35,6 +36,98 @@ def render(self, name, value, attrs=None, renderer=None):
         return ""
 
 
+class DownloadMarkedFileWidget(forms.Widget):
+    """Read-only widget that renders a download link for the marked DOCX."""
+
+    def render(self, name, value, attrs=None, renderer=None):
+        if value:
+            instance = value.instance
+            url = reverse("download_marked_docx", args=[instance.pk])
+            filename = os.path.basename(value.name)
+            return format_html(
+                '<a href="{}" target="_blank" download>{}</a>', url, filename
+            )
+        return mark_safe('<span style="color: gray;">Não disponível ainda</span>')
+
+
+class XrefStatusWidget(forms.Widget):
+    """Read-only widget that renders the ``xref_status`` JSON as an HTML summary."""
+
+    def render(self, name, value, attrs=None, renderer=None):
+        if isinstance(value, str):
+            try:
+                value = json.loads(value)
+            except (json.JSONDecodeError, ValueError):
+                value = None
+        # Only a non-empty mapping can be summarised; lists, numbers, etc. are
+        # shown as "not processed" instead of raising AttributeError on .get().
+        if not value or not isinstance(value, dict):
+            return mark_safe('<span style="color: gray;">Não processado</span>')
+        valid = value.get("valid", False)
+        total_refs = value.get("total_references", 0)
+        total_cits = value.get("total_citations", 0)
+        # "or []" also covers keys that were stored with an explicit null.
+        orphaned_bk = len(value.get("orphaned_bookmarks") or [])
+        orphaned_hl = value.get("orphaned_hyperlinks") or []
+        color = "green" if valid else "red"
+        status = "Válido" if valid else "Inválido"
+        # NOTE(review): this widget renders no form control, so a bound form
+        # presumably cleans the field to None and would overwrite the stored
+        # status on save. The hidden input echoes the current JSON so it
+        # round-trips; confirm against the form Wagtail builds for this panel.
+        html = format_html(
+            '<input type="hidden" name="{}" value="{}">', name, json.dumps(value)
+        )
+        html += format_html(
+            '<p><strong style="color:{};">{}</strong> &nbsp; {} referências | {} citações linkadas</p>',
+            color, status, total_refs, total_cits,
+        )
+        if orphaned_hl:
+            html += format_html(
+                '<p style="color:red;">Citações sem referência: {}</p>',
+                # str() keeps join() from failing on non-string entries.
+                ', '.join(str(entry) for entry in orphaned_hl),
+            )
+        if orphaned_bk:
+            html += format_html(
+                '<p style="color:orange;">{} referência(s) sem citação no texto</p>',
+                orphaned_bk,
+            )
+        return html
+
+
+class ReprocessButtonPanel(Panel):
+    """Panel that renders a "Reprocessar" link guarded by a JS confirm dialog."""
+
+    def __init__(self, confirm_message="Reprocessar este documento?", **kwargs):
+        super().__init__(**kwargs)
+        self.confirm_message = confirm_message
+
+    def clone_kwargs(self):
+        return {**super().clone_kwargs(), "confirm_message": self.confirm_message}
+
+    class BoundPanel(Panel.BoundPanel):
+        def render_html(self, parent_context=None):
+            # Function-scope import: the import hunk of this patch only brings
+            # in format_html and mark_safe from django.utils.html.
+            from django.utils.html import escapejs
+
+            if not self.instance or not self.instance.pk:
+                return ""
+            url = reverse("reprocess", args=[self.instance.pk])
+            # format_html() escapes the URL; escapejs() encodes quotes, angle
+            # brackets and ampersands so the message cannot break out of the
+            # JS string literal or the onclick attribute.
+            return format_html(
+                '<div style="margin:12px 0;">'
+                '<a href="{}" onclick="return confirm(\'{}\')" '
+                'style="padding:8px 16px;background:#e9a000;color:white;'
+                'font-weight:bold;border-radius:4px;text-decoration:none;">'
+                'Reprocessar</a></div>',
+                url,
+                escapejs(self.panel.confirm_message),
+            )
+
+
 class ArticleDocx(CommonControlField):
     title = models.TextField(_("Document Title"), null=True, blank=True)
     file = models.FileField(
@@ -278,6 +347,17 @@ class ArticleDocxMarkup(CommonControlField, ClusterableModel):
         verbose_name=_("Document"),
         upload_to="uploads_docx/",
     )
+    marked_file = models.FileField(
+        null=True,
+        blank=True,
+        verbose_name=_("Marked Document"),
+        upload_to="uploads_docx_marked/",
+    )
+    xref_status = models.JSONField(
+        _("XRef Status"),
+        null=True,
+        blank=True,
+    )
     estatus = models.IntegerField(
         _("Process estatus"),
         choices=ProcessStatus.choices,
@@ -380,6 +460,16 @@ def __str__(self):
         title = self.title or ""
         return f"{title} | {self.estatus}"
 
+    def get_marked_file_status(self):
+        if not self.marked_file:
+            return "Aguardando processamento"
+        if self.xref_status:
+            total = self.xref_status.get("total_references", 0)
+            cits = self.xref_status.get("total_citations", 0)
+            return f"✓ Disponível ({total} refs, {cits} citações)"
+        return "✓ Disponível"
+    get_marked_file_status.short_description = _("DOCX Marcado")
+
     @property
     def url_download(self):
         return self.file_xml.url if self.file_xml else None
@@ -442,6 +532,9 @@ class MarkupXML(ArticleDocxMarkup):
     panels_xml = [
         FieldPanel("file_xml", widget=ReadOnlyFileWidget()),
         FieldPanel("text_xml"),
+        ReprocessButtonPanel(
+            confirm_message="Isso irá descartar as edições manuais e reprocessar o DOCX original. Continuar?"
+        ),
     ]
 
     panels_details = [
@@ -489,3 +582,21 @@ class MarkupXML(ArticleDocxMarkup):
 
     class Meta:
         proxy = True
+
+
+class ProcessedDocx(ArticleDocxMarkup):
+    panels_doc = [
+        FieldPanel("title"),
+        FieldPanel("marked_file", widget=DownloadMarkedFileWidget()),
+        FieldPanel("xref_status", widget=XrefStatusWidget()),
+        ReprocessButtonPanel(confirm_message="Reprocessar este documento?"),
+    ]
+
+    edit_handler = TabbedInterface([
+        ObjectList(panels_doc, heading=_("DOCX Marcado")),
+    ])
+
+    class Meta:
+        proxy = True
+        verbose_name = _("DOCX processado")
+        verbose_name_plural = _("DOCXs processados")
diff --git a/markup_doc/tasks.py b/markup_doc/tasks.py
index a2e5717..b4f3988 100644
--- a/markup_doc/tasks.py
+++ b/markup_doc/tasks.py
@@ -1,7 +1,9 @@
 # Local application imports
 # Standard library imports
+import io
 import json
 import logging
+import os
 import re
 
 # Third-party imports
@@ -26,6 +28,13 @@
 from markup_doc.models import MarkupXML, ProcessStatus, UploadDocx
 from markup_doc.sync_api import sync_journals_from_api
 from markup_doc.xml import get_xml
+from markup_doc.xref import (
+    build_text_xref_replacer,
+    is_marked,
+    mark_references,
+    read_marks,
+    validate_marks,
+)
 from markuplib.function_docx import functionsDocx
 from model_ai.llama import LlamaInputSettings, LlamaService
 from reference.config_gemini import create_prompt_reference
@@ -87,6 +96,70 @@ def get_labels(article_id, user_id):
         llm_model,
     )
     doc = functionsDocx.openDocx(article_docx.file.path)
+
+    if not is_marked(doc):
+        doc = mark_references(doc)
+
+    xref_validation = validate_marks(doc)
+    if not xref_validation["valid"]:  # report through logging, not stdout
+        for err in xref_validation["errors"]:
+            logging.getLogger(__name__).error("[xref] %s", err)
+
+    article_docx.xref_status = {
+        "valid": xref_validation["valid"],
+        "total_references": len(xref_validation["bookmarks"]),
+        "total_citations": len(xref_validation["hyperlinks"]),
+        "orphaned_bookmarks": xref_validation["orphaned_bookmarks"],
+        "orphaned_hyperlinks": xref_validation["orphaned_hyperlinks"],
+        "warnings": xref_validation["warnings"],
+        "errors": xref_validation["errors"],
+    }
+
+    ref_marks = read_marks(doc)
+    xref_map = {
+        cit: ref["rid"]
+        for ref in ref_marks
+        for cit in ref["citations"]
+        if cit
+    }
+    # Expand Vancouver range/multi citations to include all rids.
+    # e.g. "[26-27]" linked to B26 should produce rid="B26 B27";
+    # "[3,4,5]" linked to B3 should produce rid="B3 B4 B5".
+    _bracket_re = re.compile(r'^\[(\d+(?:[,\-]\d+)*)\]$')
+    for cit, rid in list(xref_map.items()):
+        m = _bracket_re.match(cit.strip())
+        if not m:
+            continue
+        numbers = []
+        for part in m.group(1).split(','):
+            part = part.strip()
+            if '-' in part:
+                a, b = part.split('-', 1)
+                try:
+                    numbers.extend(range(int(a), int(b) + 1))
+                except ValueError:
+                    pass
+            else:
+                try:
+                    numbers.append(int(part))
+                except ValueError:
+                    pass
+        if len(numbers) > 1:
+            xref_map[cit] = ' '.join(f'B{n}' for n in numbers)
+    italic_variants = {
+        cit.replace("et al.", "<italic>et al.</italic>"): rid
+        for cit, rid in xref_map.items()
+        if "et al." in cit
+    }
+    xref_map.update(italic_variants)
+    text_xref_fn = build_text_xref_replacer(doc)
+
+    buf = io.BytesIO()
+    doc.save(buf)
+    buf.seek(0)
+    marked_name = os.path.splitext(os.path.basename(article_docx.file.name))[0] + "_marked.docx"
+    article_docx.marked_file.save(marked_name, ContentFile(buf.read()), save=False)
+
     sections, content = functionsDocx().extractContent(doc, article_docx.file.path)
     article_docx_markup = article_docx
     text_title = ""
@@ -361,6 +434,39 @@ def get_labels(article_id, user_id):
 
         stream_data_back.extend(process_references(num_refs, output))
 
+    # data_front is never iterated inside get_xml — rescue any <p> items that the
+    # state machine left in stream_data (body paragraphs misclassified as front
+    # because their section headings use named Word styles with font_size=0).
+    rescued = [item for item in stream_data if item.get('value', {}).get('label') == '<p>']
+    if rescued:
+        stream_data_body = rescued + stream_data_body
+        stream_data = [item for item in stream_data if item not in rescued]
+
+    # Apply xref_map (DOCX hyperlinks) and narrative Author (year) xrefs to body.
+    # Citations are wrapped only inside plain-text segments: splitting on the
+    # <xref> elements already present keeps a shorter citation from being
+    # wrapped again inside a longer one that was linked earlier in the loop.
+    xref_segment_re = re.compile(r'(<xref[^>]*>.*?</xref>)', re.DOTALL)
+    ordered_xrefs = sorted(xref_map.items(), key=lambda x: -len(x[0]))
+    for item in stream_data_body:
+        if item.get('value', {}).get('label') != '<p>':
+            continue
+        para = item['value'].get('paragraph', '') or ''
+        if not para:
+            continue
+        # 1. Dict-based from DOCX hyperlinks (longest citation first)
+        for cit_text, rid in ordered_xrefs:
+            wrapped = f'<xref ref-type="bibr" rid="{rid}">{cit_text}</xref>'
+            # re.split() with one capture group puts existing <xref> elements
+            # at odd indexes; only even (plain-text) pieces are rewritten.
+            para = ''.join(
+                piece if idx % 2 else piece.replace(cit_text, wrapped)
+                for idx, piece in enumerate(xref_segment_re.split(para))
+            )
+        # 2. Narrative "Author (year)" citations
+        para = text_xref_fn(para)
+        item['value']['paragraph'] = para
+
     article_docx_markup.content = stream_data
     article_docx_markup.content_body = stream_data_body
     article_docx_markup.content_back = stream_data_back
@@ -368,7 +466,7 @@ def get_labels(article_id, user_id):
     article_docx_markup.save()
 
     xml, stream_data_body = get_xml(
-        article_docx, stream_data, stream_data_body, stream_data_back
+        article_docx, stream_data, stream_data_body, stream_data_back, xref_map=xref_map
     )
     persist_article_xml(article_docx_markup, xml, stream_data_body)
 
diff --git a/markup_doc/views.py b/markup_doc/views.py
index 9c002da..ac2cfe4 100644
--- a/markup_doc/views.py
+++ b/markup_doc/views.py
@@ -1,5 +1,6 @@
-from django.shortcuts import render
-from django.http import HttpResponse, HttpResponseBadRequest, Http404
+from django.contrib import messages
+from django.shortcuts import render, redirect
+from django.http import FileResponse, HttpResponse, HttpResponseBadRequest, Http404
 from .models import ArticleDocxMarkup
 #from .xml import extraer_citas_apa
 from django.http import JsonResponse
@@ -53,6 +54,57 @@ def generate_xml(request, id_registro):
         return HttpResponse(f"Error al generar el XML: {str(e)}", status=500)
 
 
+def download_marked_docx(request, pk):
+    """Stream the marked DOCX of record ``pk`` as an attachment; 404 when missing."""
+    # NOTE(review): the import hunk of this patch does not add ``os``; this
+    # function-scope import is harmless if the module already imports it.
+    import os
+
+    try:
+        registro = ArticleDocxMarkup.objects.get(pk=pk)
+    except ArticleDocxMarkup.DoesNotExist:
+        raise Http404("Registro não encontrado")
+    if not registro.marked_file:
+        raise Http404("Arquivo marcado não disponível")
+    return FileResponse(
+        registro.marked_file.open("rb"),
+        as_attachment=True,
+        filename=os.path.basename(registro.marked_file.name),
+    )
+
+
+def reprocess(request, pk):
+    """Queue ``get_labels`` for record ``pk`` and redirect back to the referrer.
+
+    NOTE(review): no request.method check is made, so state is changed on a
+    plain GET request; consider requiring POST so CSRF protection applies.
+    """
+    from django.utils.http import url_has_allowed_host_and_scheme
+
+    from markup_doc.tasks import get_labels
+    from markup_doc.models import ProcessStatus
+
+    # The Referer header is client-supplied: follow it only when it points at
+    # this host, otherwise fall back to the admin root (no open redirect).
+    back_url = request.META.get("HTTP_REFERER") or "/admin/"
+    if not url_has_allowed_host_and_scheme(back_url, allowed_hosts={request.get_host()}):
+        back_url = "/admin/"
+
+    try:
+        registro = ArticleDocxMarkup.objects.get(pk=pk)
+    except ArticleDocxMarkup.DoesNotExist:
+        messages.error(request, "Registro não encontrado.")
+        return redirect(back_url)
+    if not registro.file or not registro.file.name:
+        messages.error(request, "Arquivo DOCX original não encontrado.")
+        return redirect(back_url)
+    registro.estatus = ProcessStatus.PROCESSING
+    registro.save(update_fields=["estatus"])
+    get_labels.delay(registro.pk, request.user.id)
+    messages.success(request, f'Reprocessamento iniciado para "{registro.title}".')
+    return redirect(back_url)
+
+
 def extract_citation(request):
 
     if request.method == "POST":
diff --git a/markup_doc/wagtail_hooks.py b/markup_doc/wagtail_hooks.py
index 4eb729b..bf2f9f7 100644
--- a/markup_doc/wagtail_hooks.py
+++ b/markup_doc/wagtail_hooks.py
@@ -22,6 +22,7 @@
     CollectionModel,
     JournalModel,
     MarkupXML,
+    ProcessedDocx,
     ProcessStatus,
     UploadDocx,
 )
@@ -40,6 +41,12 @@ def register_admin_urls():
         path(
             "download-xml/<int:id_registro>/", views.generate_xml, name="generate_xml"
         ),
+        path(
+            "download-marked-docx/<int:pk>/",
+            views.download_marked_docx,
+            name="download_marked_docx",
+        ),
+        path("reprocess/<int:pk>/", views.reprocess, name="reprocess"),
         path("extract-citation/", views.extract_citation, name="extract_citation"),
         path("get_journal/", views.get_journal, name="get_journal"),
         path("download-zip/", views.generate_zip, name="generate_zip"),
@@ -184,6 +191,18 @@ def index_view(self, request):
         return response
 
 
+class ProcessedDocxViewSet(SnippetViewSet):
+    model = ProcessedDocx
+    menu_label = _("DOCX processado")
+    menu_icon = "doc-full-inverse"
+    add_to_admin_menu = False
+    exclude_from_explorer = False
+    list_per_page = 20
+    list_display = ("title", "get_estatus_display", "get_marked_file_status")
+    search_fields = ("title",)
+    list_filter = ("estatus",)
+
+
 class XMLSPSSnippetViewSetGroup(SnippetViewSetGroup):
     menu_name = "xml_sps"
     menu_label = _("XML SPS")
@@ -214,6 +233,7 @@ class MarkupSnippetViewSetGroup(SnippetViewSetGroup):
     menu_order = get_menu_order("markup_doc")
     items = (
         UploadDocxViewSet,
+        ProcessedDocxViewSet,
         XMLSPSSnippetViewSetGroup,
     )
 
diff --git a/markup_doc/xml.py b/markup_doc/xml.py
index 3698ca9..e94ccf8 100644
--- a/markup_doc/xml.py
+++ b/markup_doc/xml.py
@@ -16,6 +16,52 @@
     proccess_special_content,
     sanitize_inline_xml_fragment,
 )
+from markup_doc.xref import make_text_xref_fn_from_refs
+
+_XREF_SPLIT_RE = re.compile(r'(<xref[^>]*>.*?</xref>)', re.DOTALL)
+
+
+def _apply_to_segments(text, fn):
+    """Apply fn only to plain-text segments, leaving existing <xref> tags intact.
+
+    re.split() with a single capture group returns plain text at even indexes
+    and the captured <xref>...</xref> elements at odd indexes.
+    """
+    parts = _XREF_SPLIT_RE.split(text)
+    return ''.join(fn(part) if i % 2 == 0 else part for i, part in enumerate(parts))
+
+
+def _wrap_citation(text, cit_text, rid):
+    """Wrap cit_text in a bibr <xref> wherever it occurs outside existing <xref>s."""
+    if not cit_text:
+        # str.replace('') would insert the wrapper between every character.
+        return text
+    wrapped = f'<xref ref-type="bibr" rid="{rid}">{cit_text}</xref>'
+    return _apply_to_segments(text, lambda seg: seg.replace(cit_text, wrapped))
+
+
+def _apply_xref_map(paragraph, xref_map):
+    """Apply xref_map replacements, longest citation first, without double-wrapping.
+
+    The paragraph is re-split after every citation so text wrapped by an
+    earlier (longer) entry is never matched again by a shorter one.
+    """
+    for cit_text, rid in sorted(xref_map.items(), key=lambda x: -len(x[0])):
+        paragraph = _wrap_citation(paragraph, cit_text, rid)
+    return paragraph
+
+
+def _apply_proccess_labeled_text(paragraph, data_back):
+    """Apply proccess_labeled_text to each plain-text segment independently."""
+    def process_segment(seg):
+        if not seg:
+            return seg
+        refs = proccess_labeled_text(seg, data_back)
+        for r in refs:
+            if r.get('refid'):
+                seg = _wrap_citation(seg, r.get('cita'), r['refid'])
+        return seg
+    return _apply_to_segments(paragraph, process_segment)
 
 
 def extract_date(texto):
@@ -42,7 +81,18 @@ def extract_date(texto):
     return None  # No se encontró
 
 
-def get_xml(article_docx, data_front, data, data_back):
+def get_xml(article_docx, data_front, data, data_back, xref_map=None):
+    # Build narrative Author (year) xref replacer from data_back reference texts
+    _text_xref_refs = [
+        {
+            'rid': item['value'].get('refid') or f'B{i + 1}',
+            'ref_text': item['value'].get('paragraph') or '',
+        }
+        for i, item in enumerate(data_back)
+        if item.get('value')
+    ]
+    _text_xref_fn = make_text_xref_fn_from_refs(_text_xref_refs)
+
     # Crear el elemento raíz
     nsmap = {
         "mml": "http://www.w3.org/1998/Math/MathML",
@@ -422,10 +472,10 @@ def get_xml(article_docx, data_front, data, data_back):
 
         node_tmp = etree.SubElement(node, "abstract")
 
-        if vals:
+        if vals and vals[0]:
             node_tmp2 = etree.SubElement(node_tmp, "title")
             append_fragment(node_tmp2, vals[0].value.get("paragraph"))
 
-        if vals2:
+        if vals2 and vals2[0]:
             # Encuentra su índice original en article_docx.content
             last_index = data_t.index(vals2[0])
@@ -622,6 +674,8 @@ def get_xml(article_docx, data_front, data, data_back):
             node_table_text = re.sub(r"\s*\n\s*", "", node_table_text).replace(
                 "<br>", ""
             )
+            # Escape bare "&" but keep named, decimal and hexadecimal references.
+            node_table_text = re.sub(r"&(?!\w+;|#\d+;|#[xX][0-9A-Fa-f]+;)", "&amp;", node_table_text)
 
             tabla_element = parse_xml_fragment(node_table_text)
 
@@ -716,31 +769,21 @@ def get_xml(article_docx, data_front, data, data_back):
             else:
                 node_p = etree.SubElement(node, "p")
 
-            # refs = extraer_citas_apa(d['value']['paragraph'].replace('[style name="italic"]', '').replace('[/style]', ''), data_back)
-            # refs = extraer_citas_apa(d['value']['paragraph'].replace('<italic>', '').replace('</italic>', ''), data_back)
-            if "xref" not in d["value"]["paragraph"]:
-                refs = proccess_labeled_text(d["value"]["paragraph"], data_back)
-                for r in refs:
-                    # print(f"r in refs: {r}")
-                    d["value"]["paragraph"] = d["value"]["paragraph"].replace(
-                        r["cita"],
-                        f"<xref ref-type=\"bibr\" rid=\"{r['refid']}\">{r['cita']}</xref>",
-                    )
-                    """
-                    if 'et al' in r['cita']:
-                        et_al_replace = r['cita'].replace('et al', '<italic>et al</italic>')
-                        d['value']['paragraph'] = d['value']['paragraph'].replace(et_al_replace, f"<xref reftype=\"bibr\" rid=\"{r['refid']}\">{et_al_replace}</xref>")
-                    else:
-                        #print(r['cita'])
-                        d['value']['paragraph'] = d['value']['paragraph'].replace(r['cita'], f"<xref reftype=\"bibr\" rid=\"{r['refid']}\">{r['cita']}</xref>")
-                    """
-
-                elements = proccess_special_content(d["value"]["paragraph"], data)
-                for e in elements:
-                    d["value"]["paragraph"] = d["value"]["paragraph"].replace(
-                        e["label"],
-                        f"<xref ref-type=\"{e['reftype']}\" rid=\"{e['id']}\">{e['label']}</xref>",
-                    )
+            # Apply all xref passes to every paragraph, operating segment-by-segment
+            # so that citations already marked by tasks.py pre-processing are not
+            # double-wrapped, and citations in the same paragraph that were missed
+            # still get processed.
+            if xref_map:
+                d["value"]["paragraph"] = _apply_xref_map(d["value"]["paragraph"], xref_map)
+            d["value"]["paragraph"] = _text_xref_fn(d["value"]["paragraph"])
+            d["value"]["paragraph"] = _apply_proccess_labeled_text(d["value"]["paragraph"], data_back)
+
+            elements = proccess_special_content(d["value"]["paragraph"], data)
+            for e in elements:
+                d["value"]["paragraph"] = d["value"]["paragraph"].replace(
+                    e["label"],
+                    f"<xref ref-type=\"{e['reftype']}\" rid=\"{e['id']}\">{e['label']}</xref>",
+                )
 
             append_fragment(node_p, d["value"]["paragraph"])
 
@@ -808,19 +851,16 @@ def get_xml(article_docx, data_front, data, data_back):
             append_fragment(node_tit, d["value"]["paragraph"])
         if d["value"]["label"] == "<p>":
             values = d["value"]
-            node_ref = etree.SubElement(
-                node_reflist, "ref", attrib={"id": values["refid"]}
-            )
-            # node_label = etree.SubElement(node_ref, 'label')
-            # append_fragment(node_label, values['refid'].replace('B', ''))
+            refid = values.get("refid") or f"B{i + 1}"
+            node_ref = etree.SubElement(node_reflist, "ref", attrib={"id": refid})
             node_mix = etree.SubElement(node_ref, "mixed-citation")
             append_fragment(node_mix, values["paragraph"])
 
-            if values["reftype"] == "journal":
+            if values.get("reftype") == "journal":
                 node_elem = etree.SubElement(
                     node_ref,
                     "element-citation",
-                    attrib={"publication-type": values["reftype"]},
+                    attrib={"publication-type": values.get("reftype")},
                 )
                 node_person = etree.SubElement(
                     node_elem, "person-group", attrib={"person-group-type": "author"}
@@ -874,7 +914,7 @@ def get_xml(article_docx, data_front, data, data_back):
                         values["uri"],
                     )
 
-            if values["reftype"] == "book":
+            if values.get("reftype") == "book":
                 node_elem = etree.SubElement(
                     node_ref,
                     "element-citation",
@@ -911,11 +951,11 @@ def get_xml(article_docx, data_front, data, data_back):
                     etree.SubElement(node_ref, "lpage"), str(values["lpage"])
                 )
 
-            if values["reftype"] == "data":
+            if values.get("reftype") == "data":
                 node_elem = etree.SubElement(
                     node_ref,
                     "element-citation",
-                    attrib={"publication-type": values["reftype"]},
+                    attrib={"publication-type": values.get("reftype")},
                 )
                 node_person = etree.SubElement(
                     node_elem, "person-group", attrib={"person-group-type": "author"}
@@ -952,11 +992,11 @@ def get_xml(article_docx, data_front, data, data_back):
                         values["uri"],
                     )
 
-            if values["reftype"] == "webpage":
+            if values.get("reftype") == "webpage":
                 node_elem = etree.SubElement(
                     node_ref,
                     "element-citation",
-                    attrib={"publication-type": values["reftype"]},
+                    attrib={"publication-type": values.get("reftype")},
                 )
                 node_person = etree.SubElement(
                     node_elem, "person-group", attrib={"person-group-type": "author"}
@@ -985,11 +1025,11 @@ def get_xml(article_docx, data_front, data, data_back):
                     etree.SubElement(node_ref, "access-date"), values["access_date"]
                 )
 
-            if values["reftype"] == "confproc":
+            if values.get("reftype") == "confproc":
                 node_elem = etree.SubElement(
                     node_ref,
                     "element-citation",
-                    attrib={"publication-type": values["reftype"]},
+                    attrib={"publication-type": values.get("reftype")},
                 )
                 node_person = etree.SubElement(
                     node_elem, "person-group", attrib={"person-group-type": "author"}
diff --git a/markup_doc/xref.py b/markup_doc/xref.py
new file mode 100644
index 0000000..ae0c5eb
--- /dev/null
+++ b/markup_doc/xref.py
@@ -0,0 +1,840 @@
+"""
+Cross-reference (xref) linking for the DOCX → SPS XML pipeline.
+
+Official convention
+-------------------
+- Each reference entry in the reference list receives a bookmark named
+  ``xref_B{n}`` (1-indexed, n = position in the reference list).
+- Each in-text citation becomes a Word internal hyperlink whose anchor
+  points to the corresponding ``xref_B{n}`` bookmark.
+
+This convention allows:
+- Clicking a citation in Word → jumps to the reference entry.
+- Clicking the reference entry bookmark → jumps back (if a reverse
+  hyperlink is added by the editor).
+
+Supported citation styles (auto-detected for unmarked documents):
+- ABNT        : (Autor, 2020)  or  (Autor et al., 2020)  or  Autor (2020)
+- Vancouver bracket    : [1]  or  [7,8]  or  [3-5]
+- Vancouver superscript: runs with font.superscript == True containing digits
+
+Validation rules:
+- ERROR   : a hyperlink points to a bookmark that does not exist.
+- WARNING : a bookmark has no corresponding hyperlink (uncited reference).
+"""
+
+import copy
+import re
+import unicodedata
+
+from docx import Document
+from docx.oxml import OxmlElement
+from docx.oxml.ns import qn
+
+BOOKMARK_PREFIX = "xref_B"
+
+_REF_HEADINGS = {
+    "references",
+    "referências",
+    "referências bibliográficas",
+    "referencias",
+    "referencias bibliográficas",
+    "bibliography",
+    "bibliografia",
+}
+
+_STOP_HEADINGS = {
+    "figures captions",
+    "figure captions",
+    "figures",
+    "supplementary material",
+    "supplementary materials",
+    "appendix",
+    "appendices",
+    "supporting information",
+    "acknowledgements",
+    "acknowledgments",
+    "agradecimentos",
+    "material suplementar",
+    "notas",
+    "notes",
+    # author/editor metadata sections
+    "author contributions",
+    "contribuições dos autores",
+    "contribuciones de los autores",
+    "data availability",
+    "data availability statement",
+    "disponibilidade dos dados",
+    "funding",
+    "financiamento",
+    "conflict of interest",
+    "conflicts of interest",
+    "conflito de interesses",
+    "declaration of competing interest",
+    "editors",
+    "editor associado",
+    "editor científico",
+    "associate editor",
+    "scientific editor",
+}
+
+_ALLCAPS_STOP_RE = re.compile(r'^[A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒ\s\-]{4,60}$')
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+def is_marked(doc: Document) -> bool:
+    """Report whether *doc* has both xref_B* bookmark names and xref_B* anchors."""
+    serialized = doc.element.xml
+    bookmark_hit = re.search(rf'w:name="{BOOKMARK_PREFIX}\d+"', serialized)
+    anchor_hit = re.search(rf'w:anchor="{BOOKMARK_PREFIX}\d+"', serialized)
+    return bookmark_hit is not None and anchor_hit is not None
+
+
+def validate_marks(doc: Document) -> dict:
+    """
+    Check that every xref hyperlink anchor in *doc* has a matching bookmark.
+
+    The report is a dict shaped like this::
+
+        {
+            "valid": bool,               # True when no hyperlink is dangling
+            "bookmarks": set[str],       # every xref_B* bookmark name found
+            "hyperlinks": set[str],      # every xref_B* anchor name found
+            "orphaned_bookmarks": list,  # sorted; references never cited (warnings)
+            "orphaned_hyperlinks": list, # sorted; citations without a target (errors)
+            "warnings": list[str],       # one message per orphaned bookmark
+            "errors": list[str],         # one message per orphaned hyperlink
+        }
+    """
+    serialized = doc.element.xml
+    name_re = rf'w:name="({BOOKMARK_PREFIX}\d+)"'
+    anchor_re = rf'w:anchor="({BOOKMARK_PREFIX}\d+)"'
+    bookmarks = set(re.findall(name_re, serialized))
+    hyperlinks = set(re.findall(anchor_re, serialized))
+    uncited = sorted(bookmarks.difference(hyperlinks))
+    dangling = sorted(hyperlinks.difference(bookmarks))
+    return {
+        "valid": not dangling,
+        "bookmarks": bookmarks,
+        "hyperlinks": hyperlinks,
+        "orphaned_bookmarks": uncited,
+        "orphaned_hyperlinks": dangling,
+        "warnings": [f"Reference {b} has no in-text citation." for b in uncited],
+        "errors": [
+            f"Citation links to {h} but no matching reference bookmark found."
+            for h in dangling
+        ],
+    }
+
+
+def read_marks(doc: Document) -> list:
+    """
+    Extract xref data from a marked document.
+
+    Returns a list of dicts (one per ``xref_B{n}`` bookmark), ordered by
+    bookmark index::
+
+        [
+            {
+                "rid": "B1",
+                "bookmark": "xref_B1",
+                "ref_text": "AUTOR, A. 2020. Título...",
+                "citations": ["(Autor, 2020)", ...],   # in-text citation texts
+            },
+            ...
+        ]
+
+    ``ref_text`` is looked up by position in the reference section (n-th
+    entry ↔ ``xref_B{n}``) and is "" when no entry exists at that position.
+    Hyperlinks whose anchor has no bookmark are ignored here; they are
+    reported by ``validate_marks``.
+    """
+    # Bookmark names present in the document, in numeric order
+    bk_names = sorted(
+        set(re.findall(rf'w:name="({BOOKMARK_PREFIX}\d+)"', doc.element.xml)),
+        key=lambda s: int(s[len(BOOKMARK_PREFIX):]),
+    )
+
+    # Map anchor → list of citation texts extracted from hyperlinks
+    citation_map: dict[str, list[str]] = {b: [] for b in bk_names}
+
+    # Walk the parsed tree (this reaches table cells too) instead of
+    # regex-matching serialized XML, paragraph by paragraph. Element text
+    # is already entity-decoded exactly once, whereas a chain of
+    # str.replace() calls that starts with "&amp;" turns an escaped
+    # "&amp;lt;" into "<" instead of "&lt;".
+    for hl in doc.element.body.iter(qn("w:hyperlink")):
+        anchor = hl.get(qn("w:anchor"))
+        if anchor not in citation_map:
+            continue
+        citation_text = "".join(t.text or "" for t in hl.iter(qn("w:t")))
+        citation_map[anchor].append(citation_text.strip())
+
+    # Map bookmark → reference paragraph text
+    ref_paragraphs = _find_references_section(doc)
+    ref_text_map: dict[str, str] = {}
+    for idx, (_, para) in enumerate(ref_paragraphs, start=1):
+        bk = f"{BOOKMARK_PREFIX}{idx}"
+        ref_text_map[bk] = para.text.strip()
+
+    result = []
+    for bk in bk_names:
+        n = bk[len(BOOKMARK_PREFIX):]
+        result.append({
+            "rid": f"B{n}",
+            "bookmark": bk,
+            "ref_text": ref_text_map.get(bk, ""),
+            "citations": citation_map.get(bk, []),
+        })
+    return result
+
+
+def mark_references(doc: Document) -> Document:
+    """
+    Bookmark the reference list of *doc* and hyperlink its in-text citations.
+
+    1. Every reference entry gets an ``xref_B{n}`` bookmark (n is 1-based).
+    2. The citation style is detected (ABNT, Vancouver bracket, superscript).
+    3. Detected citations are wrapped in internal hyperlinks anchored on the
+       matching bookmark.
+
+    The document is mutated in place and returned; when no reference
+    section is found it is returned untouched.
+    """
+    refs = _find_references_section(doc)
+    if not refs:
+        return doc
+
+    # Step 1 — one bookmark per entry; ids are consecutive and start at the
+    # value reported by _next_bookmark_id
+    first_id = _next_bookmark_id(doc)
+    for position, (_, ref_para) in enumerate(refs, start=1):
+        _add_bookmark_to_para(ref_para, f"{BOOKMARK_PREFIX}{position}", first_id + position - 1)
+
+    # Author/year index used by the ABNT matcher
+    ref_index = _build_ref_index(refs)
+
+    # Step 2 — choose the citation finder for the detected style
+    style = _detect_citation_style(doc)
+    if style == "vancouver_bracket":
+        citations = _find_citations_bracket(doc)
+    elif style == "vancouver_superscript":
+        citations = _find_citations_superscript(doc)
+    else:
+        citations = _find_citations_abnt(doc, ref_index)
+
+    # Step 3 — rewrite every paragraph that has citations
+    for para, spans in citations.items():
+        _insert_hyperlinks(para, spans)
+
+    return doc
+
+
+# ---------------------------------------------------------------------------
+# Detection helpers
+# ---------------------------------------------------------------------------
+
+def _detect_citation_style(doc: Document) -> str:
+    """Guess the citation style: 'abnt', 'vancouver_bracket' or 'vancouver_superscript'."""
+    paragraphs = _body_paragraphs(doc)
+    joined = " ".join(p.text for p in paragraphs)
+
+    # Three or more [1] / [1,2] / [3-5] tokens settle it: brackets are the
+    # least ambiguous signal.
+    if len(re.findall(r'\[\d+(?:[,\-]\d+)*\]', joined)) >= 3:
+        return "vancouver_bracket"
+
+    # Count superscript runs made only of digits, commas, spaces and hyphens.
+    sup_count = 0
+    for para in paragraphs:
+        for run in para.runs:
+            if run.font.superscript and re.fullmatch(r'[\d,\s\-]+', run.text.strip()):
+                sup_count += 1
+
+    # Count parenthetical "(Author ... 2020 ...)" matches for comparison.
+    abnt_count = len(re.findall(r'\([A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇ][^()]{2,80}\d{4}[^()]*\)', joined))
+
+    # Superscript wins only with a high count that clearly dominates the ABNT
+    # matches, so footnote markers or ordinals in ABNT documents do not tip it.
+    if sup_count >= 10 and sup_count > abnt_count * 3:
+        return "vancouver_superscript"
+
+    return "abnt"
+
+
+def _iter_all_paragraphs(doc: Document):
+    """Yield a Paragraph for every ``w:p`` found under the document body, tables included."""
+    from docx.text.paragraph import Paragraph as _Para
+    for p_elem in doc.element.body.iter(qn("w:p")):
+        yield _Para(p_elem, doc)
+
+
+_METADATA_RE = re.compile(
+    r'^(?:received|accepted|published|available\s+at|doi\s*:|https?://)',
+    re.IGNORECASE,
+)
+
+_YEAR_RE_SIMPLE = re.compile(r'\b(?:1[89]|20)\d{2}\b')
+
+def _find_references_section(doc: Document) -> list:
+    """Collect reference entries as ``(paragraph_index, paragraph)`` pairs.
+
+    Starts after a known reference heading; stops at the next section heading.
+    """
+    collected = []
+    inside = False
+    for position, para in enumerate(_iter_all_paragraphs(doc)):
+        stripped = para.text.strip()
+        lowered = stripped.lower()
+        if lowered in _REF_HEADINGS:
+            inside = True
+        elif inside:
+            # Known post-reference section heading
+            if lowered in _STOP_HEADINGS:
+                break
+            # Word heading style (Heading 1/2/3/...)
+            style_name = (para.style.name or '') if para.style else ''
+            if re.match(r'heading\s*\d', style_name, re.IGNORECASE):
+                break
+            # Short ALL-CAPS paragraph without a year, e.g. "EDITOR ASSOCIADO"
+            all_caps = len(stripped) <= 60 and _ALLCAPS_STOP_RE.match(stripped)
+            if stripped and all_caps and not _YEAR_RE_SIMPLE.search(stripped):
+                break
+            # Keep non-empty paragraphs unless they look like metadata lines
+            if stripped and not _METADATA_RE.match(stripped):
+                collected.append((position, para))
+    return collected
+
+
+def _build_ref_index(refs: list) -> list:
+    """Index references as ``(n, normalized_first_surname, year, para)`` tuples (n from 1)."""
+    year_re = re.compile(r'\b((?:1[89]|20)\d{2}[a-z]?)\b')
+
+    def _entry(n, para):
+        text = para.text.strip()
+        found = year_re.search(text)
+        # year keeps an optional letter suffix ("2004a"); "" when none is found
+        return (n, _normalize(_first_surname(text)), found.group(1) if found else "", para)
+
+    return [_entry(n, para) for n, (_, para) in enumerate(refs, start=1)]
+
+
+def _first_surname(ref_text: str) -> str:
+    """Return the leading surname of a reference entry, or its first 10 characters."""
+    # Covers ABNT ("SOBRENOME, Iniciais.") and Vancouver ("Sobrenome AB, ...") alike:
+    # the surname is the leading capitalised token made of letters and hyphens.
+    found = re.match(r'^([A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒ][A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒa-záéíóúàâêôãõüçäöïëøåæœ\-]+)', ref_text.strip())
+    return ref_text[:10] if found is None else found.group(1)
+
+
+def _normalize(text: str) -> str:
+    """Strip combining marks (after NFKD decomposition) and lowercase, for fuzzy comparison."""
+    decomposed = unicodedata.normalize("NFKD", text)
+    return "".join(ch for ch in decomposed if unicodedata.combining(ch) == 0).lower()
+
+
+def _body_paragraphs(doc: Document) -> list:
+    """Collect every paragraph that precedes the first reference-list heading."""
+    collected = []
+    for para in _iter_all_paragraphs(doc):
+        if para.text.strip().lower() in _REF_HEADINGS:
+            return collected
+        collected.append(para)
+    return collected
+
+
+# ---------------------------------------------------------------------------
+# Citation finders — return {para: [(start, end, anchor, text), ...]}
+# ---------------------------------------------------------------------------
+
+def _find_citations_bracket(doc: Document) -> dict:
+    """Locate [n], [n,m] and [n-m] citations; one span per cited number, same offsets."""
+    bracket_re = re.compile(r'\[(\d+(?:[,\-]\d+)*)\]')
+    found: dict = {}
+
+    for para in _body_paragraphs(doc):
+        # A bracket citing several numbers yields several spans that share
+        # (start, end, text) and differ only in the anchor.
+        spans = [
+            (hit.start(), hit.end(), f"{BOOKMARK_PREFIX}{number}", hit.group(0))
+            for hit in bracket_re.finditer(para.text)
+            for number in _expand_range(hit.group(1))
+        ]
+        if spans:
+            found[para] = spans
+    return found
+
+
+def _find_citations_superscript(doc: Document) -> dict:
+    """Locate superscript-number citations; offsets are cumulative lengths of ``run.text``."""
+    found: dict = {}
+
+    for para in _body_paragraphs(doc):
+        spans = []
+        cursor = 0
+        for run in para.runs:
+            raw = run.text
+            start, cursor = cursor, cursor + len(raw)
+            if not run.font.superscript or re.fullmatch(r'[\d,\s\-]+', raw.strip()) is None:
+                continue
+            # Word sometimes keeps separator commas in the same superscript run;
+            # drop them from both ends before expanding.
+            label = raw.strip().strip(',').strip()
+            spans.extend(
+                (start, cursor, f"{BOOKMARK_PREFIX}{number}", label)
+                for number in _expand_range(label.replace(" ", ""))
+            )
+        if spans:
+            found[para] = spans
+    return found
+
+
+def _find_citations_abnt(doc: Document, ref_index: list) -> dict:
+    """
+    Find ABNT citations in both forms and match them against *ref_index*:
+      - Parenthetical: (Author, 2020) or (Author et al., 2020; Author2, 2021)
+      - Narrative:     Author (2020) or Author et al. (2020) or Author and Author (2020)
+    Returns {paragraph: [(start, end, anchor, citation_text), ...]}.
+    """
+    result: dict = {}
+    paren_re = re.compile(
+        r'\(([A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇ][^\(\)]{2,100}\d{4}[^\(\)]*)\)',
+        re.UNICODE,
+    )
+    # Optional letter suffix, as in _build_ref_index: without it the trailing \b
+    # never matches "2004a", so suffixed parenthetical citations were skipped.
+    year_re = re.compile(r'\b((?:1[89]|20)\d{2}[a-z]?)\b')
+
+    # Surname token: handles hyphen-compounds with optional space (e.g. "Ilkiu-Borges" or "Ilkiu -Borges")
+    _sname = (
+        r'[A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒ][A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒa-záéíóúàâêôãõüçäöïëøåæœ]+'
+        r'(?:\s*-\s*[A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒa-záéíóúàâêôãõüçäöïëøåæœ]+)*'
+    )
+    narrative_re = re.compile(
+        r'(' + _sname + r'(?:\s+(?:and|&)\s+' + _sname + r')*(?:\s+et\s+al\.)?)'
+        r'\s*\((\d{4}[a-z]?(?:,\s*\d{4}[a-z]?)*)\)',
+        re.UNICODE,
+    )
+
+    for para in _body_paragraphs(doc):
+        text = para.text
+        spans = []
+        covered: set[tuple[int, int]] = set()
+
+        # 1. Parenthetical citations: (Author, year) — split on ";" for multiple
+        for m in paren_re.finditer(text):
+            inner = m.group(1)
+            for part in (p.strip() for p in inner.split(";")):
+                year_m = year_re.search(part)
+                if not year_m:
+                    continue
+                year = year_m.group(1)
+                surname_m = re.match(r'([A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇ][^\s,]+)', part)
+                if not surname_m:
+                    continue
+                anchor = _match_abnt(_normalize(surname_m.group(1)), year, ref_index)
+                if anchor and (m.start(), m.end()) not in covered:
+                    spans.append((m.start(), m.end(), anchor, m.group(0)))
+                    covered.add((m.start(), m.end()))
+
+        # 2. Narrative citations: Author (year) — not already covered by parenthetical
+        for m in narrative_re.finditer(text):
+            if (m.start(), m.end()) in covered:
+                continue
+            author_part = m.group(1).strip()
+            # Extract first surname (strip et al. first)
+            author_clean = re.sub(r'\s+et\s+al\.', '', author_part)
+            first_token = re.match(r'([^\s]+)', author_clean)
+            if not first_token:
+                continue
+            surname = _normalize(first_token.group(1))
+            # Try each year in the citation until one matches (handles "Author (1976, 1984, 1985)")
+            anchor = None
+            for yr in re.findall(r'\d{4}[a-z]?', m.group(2)):
+                anchor = _match_abnt(surname, yr, ref_index)
+                if anchor:
+                    break
+            if anchor and (m.start(), m.end()) not in covered:
+                spans.append((m.start(), m.end(), anchor, m.group(0)))
+                covered.add((m.start(), m.end()))
+
+        if spans:
+            result[para] = spans
+    return result
+
+
+def _match_abnt(surname: str, year: str, ref_index: list) -> str | None:
+    """Return the ``xref_B{n}`` anchor of the first matching reference, or None.
+
+    The first five characters of *surname* must prefix the reference's author.
+    """
+    prefix = surname[:5]
+    # Pass 1: identical year string, so "2004a" and "2004b" stay distinct.
+    # Pass 2: four-digit year only, for references stored without the suffix.
+    for as_key in (lambda y: y, lambda y: y[:4]):
+        for n, first_author, ref_year, _ in ref_index:
+            if as_key(ref_year) == as_key(year) and first_author.startswith(prefix):
+                return f"{BOOKMARK_PREFIX}{n}"
+    return None
+
+
+def _expand_range(token: str, max_span: int = 1000) -> list[int]:
+    """Expand '3,5' → [3, 5];  '7-9' → [7, 8, 9];  '2' → [2].
+
+    Malformed parts are skipped, and so are ranges covering more than
+    *max_span* numbers (a stray "[1-999999999]" must not build a giant list).
+    """
+    numbers: list[int] = []
+    for part in token.split(","):
+        low, dash, high = part.strip().partition("-")
+        try:
+            first = int(low)
+            last = int(high) if dash else first
+        except ValueError:
+            continue  # best-effort parsing: ignore malformed parts
+        if last - first < max_span:
+            numbers.extend(range(first, last + 1))
+    return numbers
+
+
+# ---------------------------------------------------------------------------
+# XML manipulation
+# ---------------------------------------------------------------------------
+
+def _next_bookmark_id(doc: Document) -> int:
+    """Return one more than the largest numeric ``w:id`` in the document XML (1 if none)."""
+    used_ids = [int(raw) for raw in re.findall(r'w:id="(\d+)"', doc.element.xml)]
+    return max(used_ids, default=0) + 1
+
+
+def _add_bookmark_to_para(para, name: str, bk_id: int):
+    """Wrap the content of *para* in a bookmark named *name* with numeric id *bk_id*."""
+    p = para._p
+    bk_start = OxmlElement("w:bookmarkStart")
+    bk_start.set(qn("w:id"), str(bk_id))
+    bk_start.set(qn("w:name"), name)
+    bk_end = OxmlElement("w:bookmarkEnd")
+    bk_end.set(qn("w:id"), str(bk_id))
+    # w:pPr has to stay the first child of w:p (CT_P content order), so the
+    # bookmark opens right after it instead of at index 0.
+    ppr = p.find(qn("w:pPr"))
+    p.insert(0 if ppr is None else p.index(ppr) + 1, bk_start)
+    p.append(bk_end)
+
+
+def _insert_hyperlinks(para, spans: list):
+    """
+    Replace citation text in *para* with internal hyperlinks.
+
+    *spans* is a list of (start, end, anchor, original_text) tuples, where
+    start/end are character offsets in ``para.text``.
+    Spans sharing the same (start, end) are collapsed to the first one, so
+    each citation text becomes exactly one hyperlink.
+
+    The paragraph is rewritten in place; nothing is returned.
+    """
+    # Deduplicate spans on (start, end) keeping first match only
+    seen = set()
+    unique_spans = []
+    for span in sorted(spans, key=lambda s: s[0]):
+        key = (span[0], span[1])
+        if key not in seen:
+            seen.add(key)
+            unique_spans.append(span)
+
+    # Rebuild paragraph XML run-by-run, inserting hyperlinks at citation positions
+    p = para._p
+    full_text = para.text
+    xml_space = "{http://www.w3.org/XML/1998/namespace}space"
+
+    # Collect (element, start, end) for the direct w:r / w:hyperlink children.
+    # Only the first w:t of a run is counted.
+    # NOTE(review): para.text may also count tabs/breaks — offsets could drift there; confirm.
+    run_segments = []
+    pos = 0
+    for child in p:
+        tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
+        if tag == "r":
+            t_elem = child.find(qn("w:t"))
+            text = t_elem.text if t_elem is not None and t_elem.text else ""
+            run_segments.append((child, pos, pos + len(text)))
+            pos += len(text)
+        elif tag == "hyperlink":
+            # Already a hyperlink — count its text length
+            inner_text = "".join(
+                (t.text or "") for t in child.iter(qn("w:t"))
+            )
+            run_segments.append((child, pos, pos + len(inner_text)))
+            pos += len(inner_text)
+
+    if not run_segments:
+        return
+
+    # run_segments entries get replaced while scanning, so remember which
+    # elements have to be detached from the paragraph at the end.
+    original_elems = [seg[0] for seg in run_segments]
+
+    # Output sequence: ('keep', elem) or ('hyperlink', anchor, text, template_run)
+    events = []
+
+    # Mark citation zones
+    citation_zones = {(s, e): (anchor, txt) for s, e, anchor, txt in unique_spans}
+
+    # Walk the segments in document order, emitting events as zones are met
+    seg_idx = 0
+    while seg_idx < len(run_segments) and full_text:
+        elem, seg_start, seg_end = run_segments[seg_idx]
+        tag = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
+
+        # Check if a citation zone starts here
+        matched_zone = None
+        for (z_start, z_end), (anchor, cit_text) in citation_zones.items():
+            if seg_start <= z_start < seg_end or (z_start <= seg_start < z_end):
+                matched_zone = (z_start, z_end, anchor, cit_text)
+                break
+
+        if matched_zone is None or tag == "hyperlink":
+            events.append(("keep", elem))
+            seg_idx += 1
+            continue
+
+        z_start, z_end, anchor, cit_text = matched_zone
+
+        # Split first run: extract text before the citation starts
+        t_node = elem.find(qn("w:t"))
+        run_text = t_node.text if t_node is not None and t_node.text else ""
+        before = run_text[:max(0, z_start - seg_start)]
+        if before:
+            r_before = copy.deepcopy(elem)
+            t_before = r_before.find(qn("w:t"))
+            t_before.text = before
+            # "text (" ends in a space that Word drops unless it is preserved
+            t_before.set(xml_space, "preserve")
+            events.append(("keep", r_before))
+
+        # Advance seg_idx to the last run that overlaps this zone.
+        # Citations like "Costa <italic>et al.</italic> (2020)" span multiple runs;
+        # without this loop only the first run portion would be hyperlinked.
+        while seg_idx + 1 < len(run_segments) and run_segments[seg_idx + 1][1] < z_end:
+            seg_idx += 1
+
+        # Extract text after the citation ends from the last run in the zone
+        last_elem, last_start, last_end = run_segments[seg_idx]
+        last_t = last_elem.find(qn("w:t"))
+        last_text = last_t.text if last_t is not None and last_t.text else ""
+        after = last_text[max(0, z_end - last_start):]
+
+        # Emit hyperlink with full citation text (first run used as style template)
+        events.append(("hyperlink", anchor, cit_text, elem))
+        del citation_zones[(z_start, z_end)]
+
+        if not after:
+            seg_idx += 1
+            continue
+
+        # Put the remainder back as the current segment instead of emitting it:
+        # it is scanned again, so further citations in the same run are linked
+        # too (emitting it directly linked only the first citation of a run).
+        r_after = copy.deepcopy(last_elem)
+        t_after = r_after.find(qn("w:t"))
+        t_after.text = after
+        t_after.set(xml_space, "preserve")
+        run_segments[seg_idx] = (r_after, last_end - len(after), last_end)
+
+    # Remaining segments
+    for elem, _, _ in run_segments[seg_idx:]:
+        events.append(("keep", elem))
+
+    # Remove old run/hyperlink children from paragraph
+    for elem in original_elems:
+        if elem in p:
+            p.remove(elem)
+
+    # Re-insert in order, right after pPr when the paragraph has one
+    ppr = p.find(qn("w:pPr"))
+    insert_pos = 0 if ppr is None else p.index(ppr) + 1
+    for event in events:
+        if event[0] == "keep":
+            node = event[1]
+        else:
+            _, anchor, cit_text, template_run = event
+            node = _make_hyperlink(anchor, cit_text, template_run)
+        p.insert(insert_pos, node)
+        insert_pos += 1
+
+
+def build_text_xref_replacer(doc: Document):
+    """
+    Create the text-level xref tagger for *doc*.
+
+    The lookup is built from the reference-list section of *doc*; entries get
+    rids B1..Bn by position (consistent with read_marks / xml.py convention).
+    The tagger wraps narrative and parenthetical citations in <xref>.
+    Returns: apply(text: str) -> str
+    """
+    ref_list = []
+    for position, (_, para) in enumerate(_find_references_section(doc), start=1):
+        ref_list.append({'rid': f'B{position}', 'ref_text': para.text.strip()})
+    return _make_text_xref_fn(ref_list)
+
+
+def make_text_xref_fn_from_refs(ref_items: list):
+    """
+    Build the citation xref replacer from reference dicts; each dict gives
+    the id under 'rid' or 'refid' and the text under 'ref_text' or 'paragraph'.
+    Returns: apply(text: str) -> str
+    """
+    return _make_text_xref_fn(ref_items)
+
+
+def _make_text_xref_fn(ref_list: list):
+    """Build the citation replacer function from a list of reference dicts.
+
+    Each dict gives the id ('rid' or 'refid', else B{position}) and the text
+    ('ref_text' or 'paragraph'). Returns apply(text: str) -> str, which wraps
+    citations in <xref>; text already inside an <xref> is left alone.
+    """
+    # Year regex includes optional letter suffix (e.g. 2004a, 2004b)
+    _year_re = re.compile(r'\b((?:1[89]|20)\d{2}[a-z]?)\b')
+    # Tuples: (skey, year_with_suffix, rid, full_ref_text_normalized) for compound-author lookup
+    ref_entries: list[tuple[str, str, str, str]] = []
+
+    for i, item in enumerate(ref_list):
+        rid = item.get('rid') or item.get('refid') or f'B{i + 1}'
+        text = item.get('ref_text') or item.get('paragraph') or ''
+        if not text:
+            continue
+        skey = _normalize(_first_surname(text))[:5]
+        norm_text = _normalize(text)
+        for year in _year_re.findall(text)[:4]:
+            ref_entries.append((skey, year, rid, norm_text))
+
+    if not ref_entries:
+        return lambda t: t
+
+    def _lookup(skey: str, year: str, extra_skeys: list[str]) -> str | None:
+        """Find best rid: prefer entries containing all author surnames."""
+        candidates = [(rid, norm) for s, y, rid, norm in ref_entries if s == skey and y == year]
+        if not candidates:
+            return None
+        if len(candidates) == 1 or not extra_skeys:
+            return candidates[0][0]
+        # Prefer candidate whose text contains the extra authors
+        for rid, norm in candidates:
+            if all(sk in norm for sk in extra_skeys):
+                return rid
+        return candidates[0][0]
+
+    # Reusable surname token: handles "Ilkiu-Borges" and "Ilkiu -Borges" (space before hyphen)
+    _sname = (
+        r'[A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒ][A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒa-záéíóúàâêôãõüçäöïëøåæœ]+'
+        r'(?:\s*-\s*[A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒa-záéíóúàâêôãõüçäöïëøåæœ]+)*'
+    )
+    # et al. can appear as plain text or wrapped in <italic> tags
+    _etal = r'(?:\s+(?:et\s+al\.|<italic>et\s+al\.?</italic>\.?))?'
+    # Match: Surname [and/& Surname]* [et al.] (year[a-z]?[, year[a-z]?]*)
+    _narrative_re = re.compile(
+        r'(' + _sname + r'(?:\s+(?:and|&)\s+' + _sname + r')*' + _etal + r')'
+        r'\s*\((\d{4}[a-z]?(?:,\s*\d{4}[a-z]?)*)\)',
+        re.UNICODE,
+    )
+    _split_re = re.compile(r'(<xref[^>]*>.*?</xref>)', re.DOTALL)
+    _etal_strip = re.compile(r'\s+(?:et\s+al\.|<italic>et\s+al\.?</italic>\.?)')
+
+    def _replace(m: re.Match) -> str:
+        full = m.group(0)
+        author_part = m.group(1).strip()
+        # Remove et al., then split on and/& to get individual author tokens
+        author_clean = _etal_strip.sub('', author_part)
+        author_tokens = re.split(r'\s+(?:and|&)\s+', author_clean)
+        skeys = [_normalize(t.split()[0])[:5] for t in author_tokens if t.strip()]
+        if not skeys:
+            return full
+        rids: list[str] = []
+        for year in re.findall(r'\d{4}[a-z]?', m.group(2)):
+            rid = _lookup(skeys[0], year, skeys[1:])
+            if rid and rid not in rids:
+                rids.append(rid)
+        if not rids:
+            return full
+        return f'<xref ref-type="bibr" rid="{" ".join(rids)}">{full}</xref>'
+
+    _paren_inner_re = re.compile(
+        r'\(([A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒ][^\(\)]{2,200}\d{4}[^\(\)]*)\)',
+        re.UNICODE,
+    )
+    # Suffix-aware like _year_re above: with a bare four-digit pattern the
+    # trailing \b cannot match "2004a", so "(Silva, 2004a)" was never tagged.
+    _paren_year_re = re.compile(r'\b((?:1[89]|20)\d{2}[a-z]?)\b')
+    _paren_author_re = re.compile(r'([A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒ][^\s,;]+)')
+
+    def _replace_paren(m: re.Match) -> str:
+        full = m.group(0)
+        inner = m.group(1)
+        parts = [p.strip() for p in re.split(r'[;,]\s*(?=[A-ZÁÉÍÓÚÀÂÊÔÃÕÜÇÄÖÏËØÅÆŒ])', inner)]
+        rids: list[str] = []
+        for part in parts:
+            yr_m = _paren_year_re.search(part)
+            if not yr_m:
+                continue
+            au_m = _paren_author_re.match(part)
+            if not au_m:
+                continue
+            skey = _normalize(au_m.group(1))[:5]
+            rid = _lookup(skey, yr_m.group(1), [])
+            if rid and rid not in rids:
+                rids.append(rid)
+        if not rids:
+            return full
+        return f'<xref ref-type="bibr" rid="{" ".join(rids)}">{full}</xref>'
+
+    def apply(text: str) -> str:
+        if not text:
+            return text
+        parts = _split_re.split(text)
+        result = []
+        for idx, part in enumerate(parts):
+            if idx % 2 != 0:
+                result.append(part)
+                continue
+            # Narrative first, then parenthetical on remaining non-xref text
+            part = _narrative_re.sub(_replace, part)
+            sub_parts = _split_re.split(part)
+            out = []
+            for i, sp in enumerate(sub_parts):
+                if i % 2 != 0:
+                    out.append(sp)
+                else:
+                    out.append(_paren_inner_re.sub(_replace_paren, sp))
+            result.append(''.join(out))
+        return ''.join(result)
+
+    return apply
+
+
+def _make_hyperlink(anchor: str, text: str, template_run) -> object:
+    """Build a ``w:hyperlink`` to *anchor* whose run is a restyled copy of *template_run*."""
+    styled_run = copy.deepcopy(template_run)
+    # Make sure the copy has run properties, then put the Hyperlink style first
+    props = styled_run.find(qn("w:rPr"))
+    if props is None:
+        props = OxmlElement("w:rPr")
+        styled_run.insert(0, props)
+    hyperlink_style = OxmlElement("w:rStyle")
+    hyperlink_style.set(qn("w:val"), "Hyperlink")
+    props.insert(0, hyperlink_style)
+
+    # Only the first w:t is rewritten; a copy without w:t is left as it is
+    text_node = styled_run.find(qn("w:t"))
+    if text_node is not None:
+        text_node.text = text
+        if text[:1] == " " or text[-1:] == " ":
+            text_node.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")
+
+    link = OxmlElement("w:hyperlink")
+    link.set(qn("w:anchor"), anchor)
+    link.append(styled_run)
+    return link
\ No newline at end of file

From d104670d3ebcd2b0429f20e71103ac82a476349a Mon Sep 17 00:00:00 2001
From: Rossi-Luciano <luciano.rossi.lucross@gmail.com>
Date: Sun, 31 May 2026 18:52:33 -0300
Subject: [PATCH 02/11] Corrige bugs em labeling_utils e function_docx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

labeling_utils.py:
- Regex de detecção da seção de referências mais precisa (refer[eê]nci|references?)
- resp_json inicializado antes do bloco condicional (evitava UnboundLocalError)
- Escapa '<' literal em append_fragment para não quebrar o parser XML
- Guarda None em proccess_special_content para search_special_id retornando None

function_docx.py:
- is_numPr inicializado antes do loop (tabelas após listas eram descartadas
  silenciosamente por flag herdada da iteração anterior)
- Parágrafos adicionados a content independente de tabelas adjacentes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 markup_doc/labeling_utils.py | 44 ++++++++++++++++++++----------------
 markuplib/function_docx.py   |  4 ++++
 2 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/markup_doc/labeling_utils.py b/markup_doc/labeling_utils.py
index f66d3d6..4ff3dbe 100644
--- a/markup_doc/labeling_utils.py
+++ b/markup_doc/labeling_utils.py
@@ -773,6 +773,17 @@ def create_labeled_object2(i, item, state, sections):
         obj["type"] = "paragraph"
         obj["value"] = {"label": state["label"], "paragraph": item.get("text")}
 
+    if state.get("body") and re.search(
+        r"^(refer[eê]ncias?|references?)\s*$", (item.get("text") or "").strip().lower()
+    ):
+        state["label"] = "<sec>"  # the reference-list heading itself is labeled as a <sec>
+        state["body"] = False  # a standalone "Referências"/"References" heading closes the body ...
+        state["back"] = True  # ... and switches the running state to back matter
+        result = {"label": "<sec>", "body": False, "back": True}
+        obj["type"] = "paragraph"
+        obj["value"] = {"label": state["label"], "paragraph": item.get("text")}
+
+
     if not result:
         result = {"label": "<p>", "body": state["body"], "back": state["back"]}
         state["label"] = result.get("label")
@@ -874,12 +885,12 @@ def get_data_first_block(text, metadata, user_id):
         "Content-Type": "application/json",
     }
 
+    resp_json = {}
     response = requests.post(url, json=payload, headers=headers)
 
     if response.status_code == 200:
         response_json = response.json()
         message_str = response_json["message"]
-
         resp_json = json.loads(message_str)
 
     return resp_json
@@ -1279,6 +1290,7 @@ def append_fragment(node_dest, val):
 
     clean = escape_angle_brackets_outside_tags(clean)
     clean = remove_unpaired_tags(clean)
+    clean = re.sub(r'<(?![/a-zA-Z_])', '&lt;', clean)
 
     if clean == "":
         parent = node_dest.getparent()
@@ -1351,23 +1363,17 @@ def proccess_special_content(text, data_body):
     res = []
     dict_type = {"f": "fig", "t": "table", "e": "disp-formula"}
 
-    try:
-        for match in re.finditer(
-            pattern, text, re.IGNORECASE | re.UNICODE | re.VERBOSE
-        ):
-            label = match.group(0)
-
-            id = search_special_id(data_body, label)
-
-            res.append(
-                {
-                    "label": label,
-                    "id": id,
-                    "reftype": dict_type.get(id[0].lower(), "other"),
-                }
-            )
-    except Exception as exc:
-        print(f"ERROR proccess_special_content: {exc}")
-        pass
+    for match in re.finditer(pattern, text, re.IGNORECASE | re.UNICODE | re.VERBOSE):
+        label = match.group(0)
+        special_id = search_special_id(data_body, label)
+        if not special_id:  # None or "": nothing to link to, and special_id[0] below would raise
+            continue
+        res.append(
+            {
+                "label": label,
+                "id": special_id,
+                "reftype": dict_type.get(special_id[0].lower(), "other"),
+            }
+        )
 
     return res
diff --git a/markuplib/function_docx.py b/markuplib/function_docx.py
index f92c31d..c9646cd 100644
--- a/markuplib/function_docx.py
+++ b/markuplib/function_docx.py
@@ -339,6 +339,10 @@ def extrae_Tabla(element, rels_map, namespaces):
             is_numPr = False
             if isinstance(element, CT_P):
                 obj = {}
+                paragraph = element
+                text_paragraph = []
+                _ns_w = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
+                is_numPr = paragraph.find('.//w:numPr', namespaces=_ns_w) is not None
 
                 namespaces = {
                     "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",

From 4ca261d3be380f801aaea40941d5af48bba93343 Mon Sep 17 00:00:00 2001
From: Rossi-Luciano <luciano.rossi.lucross@gmail.com>
Date: Sun, 31 May 2026 18:52:56 -0300
Subject: [PATCH 03/11] =?UTF-8?q?Adiciona=20bot=C3=A3o=20Reprocessar=20na?=
 =?UTF-8?q?=20interface=20Wagtail?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Insere botão na barra de ações do cabeçalho nas views de edição de
ProcessedDocx e MarkupXML, com mensagem de confirmação contextual.
Usa MutationObserver como fallback para injeção no DOM quando o
cabeçalho ainda não está renderizado.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 markup_doc/static/js/xref-button.js | 70 ++++++++++++++++++++++++++++-
 1 file changed, 69 insertions(+), 1 deletion(-)

diff --git a/markup_doc/static/js/xref-button.js b/markup_doc/static/js/xref-button.js
index 5950941..2b64935 100644
--- a/markup_doc/static/js/xref-button.js
+++ b/markup_doc/static/js/xref-button.js
@@ -523,8 +523,76 @@ function get_zip() {
   
     // También en DOMContentLoaded por si basta
     document.addEventListener("DOMContentLoaded", tryAttach);
-  
+
     // Llama una vez por si ya está listo
     tryAttach();
   })();
 
+
+// "Reprocessar" button: injected on the ProcessedDocx and MarkupXML edit views; after a confirm() it navigates to /admin/reprocess/<pk>/
+(function () {
+    var path = window.location.pathname;
+    var isProcessedDocx = path.indexOf('processeddocx/edit/') !== -1;
+    var isMarkupXml = path.indexOf('markupxml/edit/') !== -1;
+
+    if (!isProcessedDocx && !isMarkupXml) return; // not one of the two edit views: do nothing
+
+    var match = path.match(/\/edit\/(\d+)\//); // numeric pk taken from ".../edit/<pk>/"
+    if (!match) return;
+    var pk = match[1];
+
+    function makeBtn() {
+        var btn = document.createElement('button');
+        btn.type = 'button'; // plain button, so it never submits the surrounding edit form
+        btn.id = 'reprocess-btn';
+        btn.textContent = 'Reprocessar';
+        btn.style.cssText = [
+            'padding:4px 12px',
+            'cursor:pointer',
+            'background:#e9a000',
+            'color:white',
+            'font-weight:bold',
+            'border:none',
+            'border-radius:4px',
+            'margin-left:8px',
+            'font-size:14px',
+        ].join(';');
+        btn.addEventListener('mouseover', function () { btn.style.background = '#c98000'; });
+        btn.addEventListener('mouseout', function () { btn.style.background = '#e9a000'; });
+        btn.addEventListener('click', function () {
+            var msg = isMarkupXml
+                ? 'Isso irá descartar as edições manuais e reprocessar o DOCX original. Continuar?'
+                : 'Reprocessar este documento?';
+            if (confirm(msg)) {
+                window.location.href = '/admin/reprocess/' + pk + '/'; // NOTE(review): admin prefix is hard-coded; confirm it matches the URL conf
+            }
+        });
+        return btn;
+    }
+
+    function tryInsert() {
+        if (document.getElementById('reprocess-btn')) return true; // already inserted: report success
+        // Try the Wagtail header action area first (v5/v6)
+        var actionArea = document.querySelector('.w-slim-header__action-buttons')
+            || document.querySelector('[data-controller="w-slim-header"] .w-slim-header__title-wrapper');
+        if (actionArea) {
+            actionArea.appendChild(makeBtn());
+            return true;
+        }
+        // Fallback: insert right after the first submit (save) button
+        var saveBtn = document.querySelector('button[type="submit"]');
+        if (saveBtn && saveBtn.parentNode) {
+            saveBtn.parentNode.insertBefore(makeBtn(), saveBtn.nextSibling);
+            return true;
+        }
+        return false;
+    }
+
+    if (!tryInsert()) {
+        var obs = new MutationObserver(function () { // retry on every DOM change until the button is placed
+            if (tryInsert()) obs.disconnect();
+        });
+        obs.observe(document.documentElement, { childList: true, subtree: true });
+    }
+})();
+

From 5fd4849b6abbbaed58cd9bacc59b2ed9f250e17e Mon Sep 17 00:00:00 2001
From: Rossi-Luciano <luciano.rossi.lucross@gmail.com>
Date: Mon, 1 Jun 2026 09:12:48 -0300
Subject: [PATCH 04/11] Corrige IndentationError no bloco do abstract em xml.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remoção da guarda duplicada `if vals2:`, que deixava `if vals2 and vals2[0]:`
sem corpo após resolução de conflito de rebase, causando IndentationError.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 markup_doc/xml.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/markup_doc/xml.py b/markup_doc/xml.py
index e94ccf8..18ed8bc 100644
--- a/markup_doc/xml.py
+++ b/markup_doc/xml.py
@@ -477,8 +477,6 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None):
             append_fragment(node_tmp2, vals[0].value.get("paragraph"))
 
         if vals2 and vals2[0]:
-
-        if vals2:
             # Encuentra su índice original en article_docx.content
             last_index = data_t.index(vals2[0])
 

From 2d9cb26a2be7b877bd034d9d688326af2ac665db Mon Sep 17 00:00:00 2001
From: Rossi-Luciano <luciano.rossi.lucross@gmail.com>
Date: Mon, 1 Jun 2026 09:18:45 -0300
Subject: [PATCH 05/11] Corrige KeyError em reftype/refid nos blocos book e
 thesis de xml.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dois acessos diretos a values["reftype"] sobreviveram à resolução de
conflito do rebase. Substituídos por values.get("reftype") para não
lançar KeyError quando o LLM não parseia o tipo da referência.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 markup_doc/xml.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/markup_doc/xml.py b/markup_doc/xml.py
index 18ed8bc..0824ec9 100644
--- a/markup_doc/xml.py
+++ b/markup_doc/xml.py
@@ -916,7 +916,7 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None):
                 node_elem = etree.SubElement(
                     node_ref,
                     "element-citation",
-                    attrib={"publication-type": values["reftype"]},
+                    attrib={"publication-type": values.get("reftype")},
                 )
                 node_person = etree.SubElement(
                     node_elem, "person-group", attrib={"person-group-type": "author"}
@@ -1060,11 +1060,11 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None):
                 )
                 append_fragment(etree.SubElement(node_ref, "page"), values["pages"])
 
-            if values["reftype"] == "thesis":
+            if values.get("reftype") == "thesis":
                 node_elem = etree.SubElement(
                     node_ref,
                     "element-citation",
-                    attrib={"publication-type": values["reftype"]},
+                    attrib={"publication-type": values.get("reftype")},
                 )
                 node_person = etree.SubElement(
                     node_elem, "person-group", attrib={"person-group-type": "author"}

From 807e17d9bae0f276e4a4057a88cae54535f5beae Mon Sep 17 00:00:00 2001
From: Rossi-Luciano <luciano.rossi.lucross@gmail.com>
Date: Mon, 1 Jun 2026 09:44:34 -0300
Subject: [PATCH 06/11] =?UTF-8?q?Corrige=20substitui=C3=A7=C3=A3o=20de=20x?=
 =?UTF-8?q?ref=20dentro=20de=20atributos=20em=20tasks.py?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

O passe de xref_map em stream_data_body usava para.replace() cru,
sem segmentação. Para Vancouver superscript, chaves curtas como "2"
substituíam o dígito dentro de rid="B2" já criado, corrompendo o
atributo: rid="B<xref ref-type="bibr" rid="B2">2</xref>".

Substituído por _apply_xref_map_safe() com a mesma lógica segmentada
de xml.py: divide nos limites de <xref> existentes e aplica o replace
apenas nos segmentos de texto puro.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 markup_doc/tasks.py | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/markup_doc/tasks.py b/markup_doc/tasks.py
index b4f3988..167f021 100644
--- a/markup_doc/tasks.py
+++ b/markup_doc/tasks.py
@@ -443,19 +443,33 @@ def get_labels(article_id, user_id):
         stream_data = [item for item in stream_data if item not in rescued]
 
     # Apply xref_map (DOCX hyperlinks) and narrative Author (year) xrefs to body.
+    # Use segment-based replacement to avoid substituting inside already-created
+    # <xref> tags (e.g. short Vancouver superscript keys like "2" would otherwise
+    # corrupt rid="B2" attribute values on the second iteration of the loop).
+    _xref_split_re = re.compile(r'(<xref[^>]*>.*?</xref>)', re.DOTALL)
+
+    def _apply_xref_map_safe(text, xmap):
+        parts = _xref_split_re.split(text)
+        result = []
+        for i, part in enumerate(parts):
+            if i % 2 != 0:
+                result.append(part)
+                continue
+            for cit_text, rid in sorted(xmap.items(), key=lambda x: -len(x[0])):
+                part = part.replace(
+                    cit_text,
+                    f'<xref ref-type="bibr" rid="{rid}">{cit_text}</xref>',
+                )
+            result.append(part)
+        return ''.join(result)
+
     for item in stream_data_body:
         if item.get('value', {}).get('label') == '<p>':
             para = item['value'].get('paragraph', '') or ''
             if not para:
                 continue
-            # 1. Dict-based from DOCX hyperlinks
             if xref_map:
-                for cit_text, rid in sorted(xref_map.items(), key=lambda x: -len(x[0])):
-                    para = para.replace(
-                        cit_text,
-                        f'<xref ref-type="bibr" rid="{rid}">{cit_text}</xref>',
-                    )
-            # 2. Narrative "Author (year)" citations
+                para = _apply_xref_map_safe(para, xref_map)
             para = text_xref_fn(para)
             item['value']['paragraph'] = para
 

From 74951f89e11b4c7ca8211c71c24394b108e23c15 Mon Sep 17 00:00:00 2001
From: Rossi-Luciano <luciano.rossi.lucross@gmail.com>
Date: Mon, 1 Jun 2026 09:49:49 -0300
Subject: [PATCH 07/11] =?UTF-8?q?Corrige=20corrup=C3=A7=C3=A3o=20de=20atri?=
 =?UTF-8?q?butos=20xref=20em=20substitui=C3=A7=C3=B5es=20sequenciais?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

O bug: _apply_xref_map acumulava replacements dentro do mesmo segmento.
Após substituir "20" → <xref rid="B20">20</xref>, a iteração seguinte
substituía "2" dentro do próprio rid="B20" já criado, produzindo
rid="B<xref rid="B2">2</xref>0" — XML inválido.

Fix: um _apply_to_segments por citação (não um for-loop dentro do
segmento). A cada iteração, os <xref> criados tornam-se fronteiras
que protegem as iterações seguintes.

Aplicado em tasks.py (_apply_xref_map_safe) e xml.py (_apply_xref_map).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 markup_doc/tasks.py | 25 ++++++++++++-------------
 markup_doc/xml.py   | 22 +++++++++++++---------
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/markup_doc/tasks.py b/markup_doc/tasks.py
index 167f021..08c1932 100644
--- a/markup_doc/tasks.py
+++ b/markup_doc/tasks.py
@@ -448,20 +448,19 @@ def get_labels(article_id, user_id):
     # corrupt rid="B2" attribute values on the second iteration of the loop).
     _xref_split_re = re.compile(r'(<xref[^>]*>.*?</xref>)', re.DOTALL)
 
-    def _apply_xref_map_safe(text, xmap):
+    def _apply_to_seg(text, fn):
         parts = _xref_split_re.split(text)
-        result = []
-        for i, part in enumerate(parts):
-            if i % 2 != 0:
-                result.append(part)
-                continue
-            for cit_text, rid in sorted(xmap.items(), key=lambda x: -len(x[0])):
-                part = part.replace(
-                    cit_text,
-                    f'<xref ref-type="bibr" rid="{rid}">{cit_text}</xref>',
-                )
-            result.append(part)
-        return ''.join(result)
+        return ''.join(fn(p) if i % 2 == 0 else p for i, p in enumerate(parts))
+
+    def _apply_xref_map_safe(text, xmap):
+        # One pass per citation, longest key first: <xref> tags created by earlier
+        # passes become boundaries that later, shorter keys cannot touch.
+        for cit_text, rid in sorted(xmap.items(), key=lambda x: -len(x[0])):
+            if not cit_text:  # str.replace("", r) would inject r between every character
+                continue
+            r = f'<xref ref-type="bibr" rid="{rid}">{cit_text}</xref>'
+            text = _apply_to_seg(text, lambda seg, ct=cit_text, r=r: seg.replace(ct, r))
+        return text
 
     for item in stream_data_body:
         if item.get('value', {}).get('label') == '<p>':
diff --git a/markup_doc/xml.py b/markup_doc/xml.py
index 0824ec9..ef70bbb 100644
--- a/markup_doc/xml.py
+++ b/markup_doc/xml.py
@@ -28,15 +28,19 @@ def _apply_to_segments(text, fn):
 
 
 def _apply_xref_map(paragraph, xref_map):
-    """Apply xref_map replacements segment-by-segment to avoid double-wrapping."""
-    def replace_in_segment(seg):
-        for cit_text, rid in sorted(xref_map.items(), key=lambda x: -len(x[0])):
-            seg = seg.replace(
-                cit_text,
-                f'<xref ref-type="bibr" rid="{rid}">{cit_text}</xref>',
-            )
-        return seg
-    return _apply_to_segments(paragraph, replace_in_segment)
+    """Apply xref_map replacements one citation at a time.
+
+    Each citation is applied via a fresh _apply_to_segments pass so that
+    <xref> tags created by earlier iterations are respected as boundaries
+    for later, shorter keys (e.g. "20" is replaced first, then "2" must
+    not touch the already-created rid="B20" attribute). Empty keys are skipped.
+    """
+    for cit_text, rid in sorted(xref_map.items(), key=lambda x: -len(x[0])):
+        if not cit_text:  # str.replace("", r) would inject r between every character
+            continue
+        r = f'<xref ref-type="bibr" rid="{rid}">{cit_text}</xref>'
+        paragraph = _apply_to_segments(paragraph, lambda s, c=cit_text, r=r: s.replace(c, r))
+    return paragraph
 
 
 def _apply_proccess_labeled_text(paragraph, data_back):

From 71aaca6f94da55ca14397a713902c2e99b6623fa Mon Sep 17 00:00:00 2001
From: Rossi-Luciano <luciano.rossi.lucross@gmail.com>
Date: Mon, 1 Jun 2026 09:54:01 -0300
Subject: [PATCH 08/11] Adiciona guards .get() em acessos a figid, figlabel,
 tabid, eid em xml.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Artigos com figuras sem figlabel levantavam KeyError. Substituídos por
.get() com fallback vazio para todos os campos opcionais de elementos
especiais (fig, table, formula).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 markup_doc/xml.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/markup_doc/xml.py b/markup_doc/xml.py
index ef70bbb..d593a75 100644
--- a/markup_doc/xml.py
+++ b/markup_doc/xml.py
@@ -654,7 +654,7 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None):
                 node_list.append(child)
 
         if d["value"]["label"] == "<table>" or d["value"]["label"] == "<table-caption>":
-            attrib = {"id": d["value"]["tabid"]}
+            attrib = {"id": d["value"].get("tabid", "")}
 
             if subsec:
                 node_p = etree.SubElement(node_sec, "p")
@@ -694,7 +694,7 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None):
             append_fragment(node_fnp, d["value"]["paragraph"])
 
         if d["value"]["label"] == "<fig>":
-            attrib = {"id": d["value"]["figid"]}
+            attrib = {"id": d["value"].get("figid", "")}
 
             if subsec:
                 node_p = etree.SubElement(node_sec, "p")
@@ -703,11 +703,9 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None):
                 node_p = etree.SubElement(node, "p")
                 node_fig = etree.SubElement(node_p, "fig", attrib=attrib)
 
-            etree.SubElement(node_fig, "label").text = d["value"]["figlabel"]
+            etree.SubElement(node_fig, "label").text = d["value"].get("figlabel")
             node_caption = etree.SubElement(node_fig, "caption")
-            etree.SubElement(node_caption, "title").text = (
-                d["value"]["title"] if "title" in d["value"] else None
-            )
+            etree.SubElement(node_caption, "title").text = d["value"].get("title")
 
             Image = get_image_model()
             image_id = d["value"]["image"]
@@ -728,7 +726,7 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None):
             append_fragment(node_attrib, d["value"]["paragraph"])
 
         if d["value"]["label"] == "<disp-formula>":
-            attrib = {"id": d["value"]["eid"]}
+            attrib = {"id": d["value"].get("eid", "")}
 
             if subsec:
                 node_p = etree.SubElement(node_sec, "p")
@@ -745,7 +743,7 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None):
                     append_fragment(node_f, c["value"])
 
         if d["value"]["label"] == "<inline-formula>":
-            attrib = {"id": d["value"]["eid"]}
+            attrib = {"id": d["value"].get("eid", "")}
 
             if subsec:
                 node_p = etree.SubElement(node_sec, "p")

From 57bcbbc18fe4574f88c2833e3872f8c600c7ae99 Mon Sep 17 00:00:00 2001
From: Rossi-Luciano <luciano.rossi.lucross@gmail.com>
Date: Mon, 1 Jun 2026 09:56:07 -0300
Subject: [PATCH 09/11] Corrige KeyError em search_special_id para
 figid/figlabel/tabid/tablabel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Figuras e tabelas sem rótulo levantavam KeyError em search_special_id.
Substituídos por .get() com fallback vazio e guards de comprimento antes
de indexar figid/tabid.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 markup_doc/labeling_utils.py | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/markup_doc/labeling_utils.py b/markup_doc/labeling_utils.py
index 4ff3dbe..af790ff 100644
--- a/markup_doc/labeling_utils.py
+++ b/markup_doc/labeling_utils.py
@@ -1028,22 +1028,30 @@ def search_special_id(data_body, label):
             clean_label = re.sub(r"^[\s\.,;:–—-]+", "", label).capitalize()
 
             if d["type"] == "image":
-                if clean_label == data["figlabel"]:
-                    return data.get("figid")
+                figlabel = data.get("figlabel") or ""
+                figid = data.get("figid") or ""
+                if clean_label == figlabel:
+                    return figid or None
                 if (
-                    data["figid"][0] == clean_label.lower()[0]
-                    and data["figid"][1] in clean_label.lower()
+                    figid
+                    and len(figid) > 1
+                    and figid[0] == clean_label.lower()[:1]
+                    and figid[1] in clean_label.lower()
                 ):
-                    return data.get("figid")
+                    return figid
 
             if d["type"] == "table":
-                if clean_label == data["tablabel"]:
-                    return data.get("tabid")
+                tablabel = data.get("tablabel") or ""
+                tabid = data.get("tabid") or ""
+                if clean_label == tablabel:
+                    return tabid or None
                 if (
-                    data["tabid"][0] == clean_label.lower()[0]
-                    and data["tabid"][1] in clean_label.lower()
+                    tabid
+                    and len(tabid) > 1
+                    and tabid[0] == clean_label.lower()[:1]
+                    and tabid[1] in clean_label.lower()
                 ):
-                    return data.get("tabid")
+                    return tabid
 
     for d in data_body:
         if d["type"] in ["compound_paragraph"]:

From cb444dd5e99a8f849630866196fffb48d6522c9a Mon Sep 17 00:00:00 2001
From: Rossi-Luciano <luciano.rossi.lucross@gmail.com>
Date: Mon, 1 Jun 2026 09:57:41 -0300
Subject: [PATCH 10/11] =?UTF-8?q?Corrige=20AttributeError=20em=20bloco=20<?=
 =?UTF-8?q?list>=20quando=20regex=20n=C3=A3o=20encontra=20list-type?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fallback para "bullet" quando o parágrafo não contém o padrão esperado.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 markup_doc/xml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/markup_doc/xml.py b/markup_doc/xml.py
index d593a75..76f7535 100644
--- a/markup_doc/xml.py
+++ b/markup_doc/xml.py
@@ -627,7 +627,7 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None):
 
         if d["value"]["label"] == "<list>":
             re_search = re.search(r'list list-type="(.*?)"\]', d["value"]["paragraph"])
-            list_type = re_search.group(1)
+            list_type = re_search.group(1) if re_search else "bullet"
             attrib = {"list-type": list_type}
 
             if subsec:

From 30b4b3a948f6999f2c7bda5ec0b705654f73d24c Mon Sep 17 00:00:00 2001
From: Rossi-Luciano <luciano.rossi.lucross@gmail.com>
Date: Mon, 1 Jun 2026 10:05:39 -0300
Subject: [PATCH 11/11] =?UTF-8?q?Corrige=20AttributeError=20na=20extra?=
 =?UTF-8?q?=C3=A7=C3=A3o=20de=20conte=C3=BAdo=20de=20lista=20em=20xml.py?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fallback para string vazia quando o regex de content_list não encontra
o padrão [list ...][/list] no parágrafo.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 markup_doc/xml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/markup_doc/xml.py b/markup_doc/xml.py
index 76f7535..537efb6 100644
--- a/markup_doc/xml.py
+++ b/markup_doc/xml.py
@@ -642,7 +642,7 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None):
                 d["value"]["paragraph"],
                 re.DOTALL,
             )
-            content_list = content_list.group(1)
+            content_list = content_list.group(1) if content_list else ""
             node_list_text = content_list.replace(
                 "[list-item]", "<list-item><p>"
             ).replace("[/list-item]", "</p></list-item>")