From f5930dba4f885e2d0e5ce27fed923e48e01d5909 Mon Sep 17 00:00:00 2001
From: Edgar <eduranm@dgb.unam.mx>
Date: Thu, 21 May 2026 00:38:35 -0600
Subject: [PATCH 01/13] =?UTF-8?q?C=C3=B3digo=20faltante=20de=20issues,=20d?=
 =?UTF-8?q?efault=20size=2012,=20salto=20de=20l=C3=ADnea=20en=20abstract?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 markuplib/function_docx.py | 40 +++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/markuplib/function_docx.py b/markuplib/function_docx.py
index c9646cd..628a063 100644
--- a/markuplib/function_docx.py
+++ b/markuplib/function_docx.py
@@ -400,15 +400,16 @@ def extrae_Tabla(element, rels_map, namespaces):
                         mathml_root, pretty_print=True, encoding="unicode"
                     )
 
+                if not obj_image:
+                    paragraph = element
+                    text_paragraph = []
+
                 if not obj_image:
                     paragraph = element
                     text_paragraph = []
 
                     # Determina si es parte de una lista
-                    is_numPr = (
-                        paragraph.find(".//w:numPr", namespaces=paragraph.nsmap)
-                        is not None
-                    )
+                    is_numPr = paragraph.find('.//w:numPr', namespaces=paragraph.nsmap) is not None
 
                     # obtiene id y nivel
                     if is_numPr:
@@ -457,13 +458,10 @@ def extrae_Tabla(element, rels_map, namespaces):
                                 if t_elem is not None and t_elem.text:
                                     text_paragraph.append(t_elem.text)
 
-                        elif (
-                            child.tag
-                            == "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r"
-                        ):
-                            namespaces = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
-                            sz_element = child.find(".//w:sz", namespaces=child.nsmap)
-                            obj["font_size"] = 0
+                        elif child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r':
+                            namespaces = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
+                            sz_element = child.find('.//w:sz', namespaces=child.nsmap)
+                            obj['font_size'] = 12
 
                             if sz_element is None:
                                 p_pr = paragraph.find(
@@ -607,11 +605,23 @@ def extrae_Tabla(element, rels_map, namespaces):
                                 first_block = ""
                                 tmp_content = []
                                 abstract_mode = False
+                                abstract_started = False
 
                                 for c in content:
                                     if abstract_mode:
-                                        if c["text"] == "" or c["spacing"] is True:
+                                        if not abstract_started:
+                                            if c['text'] == '' or c['spacing'] is True:
+                                                continue
+                                            else:
+                                                abstract_started = True
+                                                tmp_content.append(c)
+                                                continue
+                                        
+                                        # empezó el abstract: si encuentra vacío, marca fin
+                                        if c['text'] == '' or c['spacing'] is True:
                                             abstract_mode = False
+                                            abstract_started = False
+                                            continue
                                         else:
                                             tmp_content.append(c)
                                             continue
@@ -619,9 +629,11 @@ def extrae_Tabla(element, rels_map, namespaces):
                                     if "paraph" in c:
                                         tmp_content.append(c)
                                         abstract_mode = False
-                                        if c["paraph"] == "<abstract>":
+                                        abstract_started = False
+                                        if c['paraph'] == '<abstract>':
                                             abstract_mode = True
-                                            continue
+                                            abstract_started = False
+                                            continue                                        
                                     else:
                                         if "text" in c:
                                             first_block = first_block + "\n" + c["text"]

From 789df95093a1b27f308df461ca8031cfa15866fd Mon Sep 17 00:00:00 2001
From: Edgar <eduranm@dgb.unam.mx>
Date: Thu, 21 May 2026 00:40:34 -0600
Subject: [PATCH 02/13] =?UTF-8?q?Funci=C3=B3n=20para=20revisar=20orden=20a?=
 =?UTF-8?q?lfab=C3=A9tico=20en=20p=C3=A1rrafos,=20detecci=C3=B3n=20t=C3=AD?=
 =?UTF-8?q?tulo=20referencias?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 markup_doc/labeling_utils.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/markup_doc/labeling_utils.py b/markup_doc/labeling_utils.py
index af790ff..2def4d2 100644
--- a/markup_doc/labeling_utils.py
+++ b/markup_doc/labeling_utils.py
@@ -2,6 +2,8 @@
 import html
 import json
 import re
+import requests
+import unicodedata
 
 import requests
 
@@ -748,6 +750,20 @@ def match_subsection(item, sections):
     )
 
 
+def normalize_text(text):
+    """Return text without markup tags, trimmed, lowercased and accent-free; None yields ''."""
+    text = re.sub(r'<[^>]+>', '', text or '').strip().lower()  # None-safe: callers may pass a missing paragraph
+    text = unicodedata.normalize('NFD', text)  # decompose so accents become separate combining marks
+    text = ''.join(c for c in text if unicodedata.category(c) != 'Mn')  # drop the combining marks
+    return text
+
+
+def comes_before_or_equal(obj1, obj2):
+    """Return True if obj1's normalized 'paragraph' sorts at or before obj2's."""
+    first, second = (normalize_text(o.get('value', {}).get('paragraph', '')) for o in (obj1, obj2))
+    return first <= second
+
+
 def create_labeled_object2(i, item, state, sections):
     obj = {}
     result = None
@@ -784,6 +800,23 @@ def create_labeled_object2(i, item, state, sections):
         obj["value"] = {"label": state["label"], "paragraph": item.get("text")}
 
 
+    text = item.get('text', '').strip().lower()
+
+    is_references_title = bool(re.fullmatch(
+        r"(?:referencias|references|referências)\s*[:.]?",
+        text
+    ))
+
+    if state.get('body') and is_references_title:  
+        state['label'] = '<sec>'
+        state['body'] = False
+        state['back'] = True
+        obj['type'] = 'paragraph'
+        obj['value'] = {
+            'label': state['label'],
+            'paragraph': item.get('text')
+        }
+    
     if not result:
         result = {"label": "<p>", "body": state["body"], "back": state["back"]}
         state["label"] = result.get("label")

From e4bbc584f4faef13af196d09f09f46982ecf0d27 Mon Sep 17 00:00:00 2001
From: Edgar <eduranm@dgb.unam.mx>
Date: Thu, 21 May 2026 00:41:49 -0600
Subject: [PATCH 03/13] Identificar texto posterior a referencias

---
 markup_doc/tasks.py | 48 +++++++++++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 21 deletions(-)

diff --git a/markup_doc/tasks.py b/markup_doc/tasks.py
index 130c16d..65db63c 100644
--- a/markup_doc/tasks.py
+++ b/markup_doc/tasks.py
@@ -27,6 +27,8 @@
     process_reference,
     process_references,
     split_in_three,
+    create_special_content_object,
+    comes_before_or_equal
 )
 from markup_doc.models import MarkupXML, ProcessStatus, UploadDocx
 from markup_doc.sync_api import sync_issues_from_api, sync_journals_from_api
@@ -206,6 +208,9 @@ def get_labels(article_id, user_id):
     next_item = None
     obj_reference = []
     llm_first_block = None
+    obj_postreference = []
+    last_obj = None
+    llama_model = False
 
     for i, item in enumerate(content):
         if next_item:
@@ -369,14 +374,11 @@ def get_labels(article_id, user_id):
             stream_data_body.append(obj)
             continue
 
-        if item.get("text") is None or item.get("text") == "":
-            state["label_next"] = (
-                state["label_next_reset"] if state["reset"] else state["label_next"]
-            )
-            if state["back"]:
-                state["back"] = False
-                state["body"] = False
-                state["references"] = True
+        if item.get('text') is None or item.get('text') == '':
+            state['label_next'] = state['label_next_reset'] if state['reset'] else state['label_next']
+            if state['back'] and num_ref > 0:
+                state['body'] = False
+                state['references'] = True
         else:
             obj, result, state = create_labeled_object2(i, item, state, sections)
 
@@ -403,20 +405,23 @@ def get_labels(article_id, user_id):
                             stream_data.append(obj)
                     else:
                         stream_data_body.append(obj)
-                elif state["back"]:
-                    if state["label"] == "<sec>":
+                elif state['back']:
+                    if state['references']:
+                        obj_postreference.append(obj)
+                    elif state['label'] == '<sec>':
                         stream_data_back.append(obj)
-                    if state["label"] == "<p>":
-                        num_ref = num_ref + 1
-                        # obj = {}#process_reference(num_ref, obj, user_id)
-                        obj_reference.append(
-                            {
-                                "num_ref": num_ref,
-                                "obj": obj,
-                                "text": obj["value"]["paragraph"],
-                            }
-                        )
-                    # stream_data_back.append(obj)
+                    elif state['label'] == '<p>':
+                        if last_obj is not None and not re.search(r"^(refer)",last_obj.get('value', {}).get('paragraph', '').strip().lower()):
+                            if comes_before_or_equal(last_obj, obj):
+                                num_ref = num_ref + 1
+                                obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],})
+                            else:
+                                obj_postreference.append(obj)
+                                state['references'] = True
+                        else:
+                            num_ref = num_ref + 1
+                            obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],})
+                        last_obj = obj
                 else:
                     stream_data.append(obj)
 
@@ -455,6 +460,7 @@ def get_labels(article_id, user_id):
                     output.extend(parsed)  # Agrega a la lista de salida
 
         stream_data_back.extend(process_references(num_refs, output))
+        stream_data_back.extend(obj_postreference)
 
     # data_front is never iterated inside get_xml — rescue any <p> items that the
     # state machine left in stream_data (body paragraphs misclassified as front

From ddd4c85d0a5bcf6ba522b4e63520432e43cbc54c Mon Sep 17 00:00:00 2001
From: Edgar <eduranm@dgb.unam.mx>
Date: Thu, 21 May 2026 00:43:38 -0600
Subject: [PATCH 04/13] Texto en back que no es referencia

---
 markup_doc/xml.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/markup_doc/xml.py b/markup_doc/xml.py
index 64c05ab..ef26904 100644
--- a/markup_doc/xml.py
+++ b/markup_doc/xml.py
@@ -861,15 +861,18 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None):
                 node_p.append(child)
 
     for i, d in enumerate(data_back):
-        if d["value"]["label"] == "<sec>":
-            node_tit = etree.SubElement(node_reflist, "title")
-            append_fragment(node_tit, d["value"]["paragraph"])
-        if d["value"]["label"] == "<p>":
-            values = d["value"]
-            refid = values.get("refid") or f"B{i + 1}"
-            node_ref = etree.SubElement(node_reflist, "ref", attrib={"id": refid})
-            node_mix = etree.SubElement(node_ref, "mixed-citation")
-            append_fragment(node_mix, values["paragraph"])
+        if d['value']['label'] == '<sec>':
+            node_tit = etree.SubElement(node_reflist, 'title')
+            append_fragment(node_tit, d['value']['paragraph'])
+        if d['value']['label'] == '<p>':
+            if 'refid' not in d['value']:
+                continue
+            values = d['value']
+            node_ref = etree.SubElement(node_reflist, 'ref', attrib={"id": values['refid']})
+            #node_label = etree.SubElement(node_ref, 'label')
+            #append_fragment(node_label, values['refid'].replace('B', ''))
+            node_mix = etree.SubElement(node_ref, 'mixed-citation')
+            append_fragment(node_mix, values['paragraph'])
 
             if values.get("reftype") == "journal":
                 node_elem = etree.SubElement(

From 62e3eda80955f7ab427069f4bb1f3c9943c9f28d Mon Sep 17 00:00:00 2001
From: Edgar <eduranm@dgb.unam.mx>
Date: Thu, 28 May 2026 10:04:41 -0600
Subject: [PATCH 05/13] =?UTF-8?q?Detecci=C3=B3n=20de=20introduccion=20para?=
 =?UTF-8?q?=20tomar=20estilo=20de=20referencia=20de=20secciones,=20soluci?=
 =?UTF-8?q?=C3=B3n=20para=20tablas=20o=20imagenes=20descartadas=20despu?=
 =?UTF-8?q?=C3=A9s=20de=20una=20lista?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 markuplib/function_docx.py | 579 ++++++++++++++++++++-----------------
 1 file changed, 308 insertions(+), 271 deletions(-)

diff --git a/markuplib/function_docx.py b/markuplib/function_docx.py
index 628a063..33a8162 100644
--- a/markuplib/function_docx.py
+++ b/markuplib/function_docx.py
@@ -175,6 +175,29 @@ def match_paragraph(text):
                 return "<date-received>"
             return False
 
+        def is_intro_heading(text):
+            """Tell whether text is an Introduction heading (es/en/pt).
+
+            The text goes through clean_labels, <italic> tags are removed, and a
+            leading outline number plus one trailing ':' or '.' are ignored.
+            """
+            if not text:
+                return False
+
+            heading = re.sub(r'<\/?italic>', '', clean_labels(text), flags=re.I)
+            heading = heading.strip().lower()
+            # Drop leading numbering such as "1. ", "1 " or "1.1 ".
+            heading = re.sub(r'^\s*\d+(?:\.\d+)*\.?\s+', '', heading)
+            # Drop a single trailing colon or full stop.
+            heading = re.sub(r'[:.]$', '', heading).strip()
+
+            intro_titles = [
+                'introducción',
+                'introduction',
+                'introdução'
+            ]
+            return heading in intro_titles
+
         def matches_section(a, b):
             try:
                 return (
@@ -191,22 +214,22 @@ def section_priority(sections):
 
         def identify_section(sections, size, bold, text):
             if size == 0:
-                return sections
+                return sections, None
 
             isupper = text.isupper()
-            s_id = {"size": size, "bold": bold, "isupper": isupper, "count": 0}
+            s_id = {"size": size, "bold": bold, "isupper": isupper, "count": 1}
 
             if len(sections) == 0:
                 sections.append(s_id)
-                return sections
+                return sections, s_id
 
             for section in sections:
                 if matches_section(s_id, section):
-                    section["count"] += 1
-                    return sections
+                    section['count'] += 1
+                    return sections, section
 
             sections.append(s_id)
-            return sections
+            return sections, s_id
 
         def clean_labels(text):
             # Eliminar etiquetas cuadradas tipo [ ... ] con espacios opcionales
@@ -327,6 +350,7 @@ def extrae_Tabla(element, rels_map, namespaces):
         images = []
         found_fb = False
         review_fb = True
+        intro_section = None
         # Palabras a buscar como indicador del primer bloque
         start_text = ["introducción", "introduction", "introdução"]
 
@@ -337,13 +361,9 @@ def extrae_Tabla(element, rels_map, namespaces):
 
         for element in doc.element.body:
             is_numPr = False
-            if isinstance(element, CT_P):
-                obj = {}
-                paragraph = element
-                text_paragraph = []
-                _ns_w = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
-                is_numPr = paragraph.find('.//w:numPr', namespaces=_ns_w) is not None
+            obj = {}
 
+            if isinstance(element, CT_P):
                 namespaces = {
                     "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
                     "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
@@ -375,16 +395,29 @@ def extrae_Tabla(element, rels_map, namespaces):
                             if image_name not in images:
                                 images.append(image_name)
 
-                                # Guardar la imagen en Wagtail
                                 wagtail_image = ImageModel.objects.create(
                                     title=image_name,
                                     file=ContentFile(image_data, name=image_name),
                                 )
 
-                                # Referenciar la imagen guardada en el objeto
                                 obj["type"] = "image"
                                 obj["image"] = wagtail_image.id
 
+                # Si el párrafo contiene imagen, no debe depender de is_numPr
+                if obj_image:
+                    if len(current_list) > 0:
+                        current_list.append("[/list]")
+                        objl = {}
+                        objl["type"] = "list"
+                        objl["list"] = "\n".join(current_list)
+                        current_list = []
+                        content.append(objl)
+
+                    if obj.get("type") == "image":
+                        content.append(obj)
+
+                    continue
+
                 ns_math = {
                     "m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
                     "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
@@ -400,46 +433,35 @@ def extrae_Tabla(element, rels_map, namespaces):
                         mathml_root, pretty_print=True, encoding="unicode"
                     )
 
-                if not obj_image:
-                    paragraph = element
-                    text_paragraph = []
+                # Si el párrafo contiene fórmula de bloque, tampoco debe depender de is_numPr
+                if obj_formula:
+                    if len(current_list) > 0:
+                        current_list.append("[/list]")
+                        objl = {}
+                        objl["type"] = "list"
+                        objl["list"] = "\n".join(current_list)
+                        current_list = []
+                        content.append(objl)
 
-                if not obj_image:
-                    paragraph = element
-                    text_paragraph = []
+                    content.append(obj)
+                    continue
 
-                    # Determina si es parte de una lista
-                    is_numPr = paragraph.find('.//w:numPr', namespaces=paragraph.nsmap) is not None
+                paragraph = element
+                text_paragraph = []
+
+                # Determina si es parte de una lista
+                is_numPr = paragraph.find(".//w:numPr", namespaces=paragraph.nsmap) is not None
+
+                # obtiene id y nivel
+                if is_numPr:
+                    numPr = paragraph.find(".//w:numPr", namespaces=paragraph.nsmap)
+                    numId = numPr.find(".//w:numId", namespaces=paragraph.nsmap).get(namespaces_p + "val")
+                    type_matches = [(key, objt) for key, objt in list_types.items() if objt["numId"] == numId]
+
+                    # Es una lista diferente
+                    if numId != current_num_id:
+                        current_num_id = numId
 
-                    # obtiene id y nivel
-                    if is_numPr:
-                        numPr = paragraph.find(".//w:numPr", namespaces=paragraph.nsmap)
-                        numId = numPr.find(
-                            ".//w:numId", namespaces=paragraph.nsmap
-                        ).get(namespaces_p + "val")
-                        type = [
-                            (key, objt)
-                            for key, objt in list_types.items()
-                            if objt["numId"] == numId
-                        ]
-
-                        # Es una lista diferente
-                        if numId != current_num_id:
-                            current_num_id = numId
-                            if len(current_list) > 0:
-                                current_list.append("[/list]")
-                                objl = {}
-                                objl["type"] = "list"
-                                objl["list"] = "\n".join(current_list)
-                                current_list = []
-                                content.append(objl)
-                            list_type = "bullet"
-                            if type[0][1][str(0)] == "decimal":
-                                list_type = "order"
-
-                            current_list.append(f'[list list-type="{list_type}"]')
-                    else:
-                        # Se terminaron de agregar elementos a la lista
                         if len(current_list) > 0:
                             current_list.append("[/list]")
                             objl = {}
@@ -448,263 +470,278 @@ def extrae_Tabla(element, rels_map, namespaces):
                             current_list = []
                             content.append(objl)
 
-                    for child in paragraph:
-                        if (
-                            child.tag
-                            == "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}hyperlink"
-                        ):
-                            for r in child.findall("w:r", namespaces=child.nsmap):
-                                t_elem = r.find("w:t", namespaces=child.nsmap)
-                                if t_elem is not None and t_elem.text:
-                                    text_paragraph.append(t_elem.text)
-
-                        elif child.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r':
-                            namespaces = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
-                            sz_element = child.find('.//w:sz', namespaces=child.nsmap)
-                            obj['font_size'] = 12
-
-                            if sz_element is None:
-                                p_pr = paragraph.find(
-                                    ".//w:rPr/w:sz", namespaces=child.nsmap
-                                )
-                                if p_pr is not None:
-                                    sz_element = p_pr.find(
-                                        ".//w:pPr", namespaces=child.nsmap
-                                    )
-
-                            if sz_element is not None:
-                                xml_string = etree.tostring(
-                                    sz_element, pretty_print=True, encoding="unicode"
-                                )
-                                size_element = objectify.fromstring(xml_string)
-                                font_size_value = size_element.get(namespaces + "val")
-                                obj["font_size"] = int(font_size_value) / 2
+                        list_type = "bullet"
 
-                            color_element = child.find(
-                                ".//w:color", namespaces=child.nsmap
-                            )
+                        if type_matches[0][1][str(0)] == "decimal":
+                            list_type = "order"
 
-                            if color_element is None:
-                                p_pr = paragraph.find(
-                                    ".//w:pPr", namespaces=child.nsmap
-                                )
-                                if p_pr is not None:
-                                    color_element = p_pr.find(
-                                        ".//w:rPr/w:color", namespaces=child.nsmap
-                                    )
-
-                            if color_element is not None:
-                                xml_string_color = etree.tostring(
-                                    color_element, pretty_print=True, encoding="unicode"
-                                )
-                                object_element = objectify.fromstring(xml_string_color)
-                                color_value = object_element.get(namespaces + "val")
-                                obj["color"] = color_value
+                        current_list.append(f'[list list-type="{list_type}"]')
 
-                            b_tag = child.find(".//w:b", namespaces=child.nsmap)
+                else:
+                    # Se terminaron de agregar elementos a la lista
+                    if len(current_list) > 0:
+                        current_list.append("[/list]")
+                        objl = {}
+                        objl["type"] = "list"
+                        objl["list"] = "\n".join(current_list)
+                        current_list = []
+                        content.append(objl)
 
-                            if b_tag is None:
-                                p_pr = paragraph.find(
-                                    ".//w:rPr/w:b", namespaces=child.nsmap
-                                )
-                                if p_pr is not None:
-                                    b_tag = p_pr.find(
-                                        ".//w:pPr", namespaces=child.nsmap
-                                    )
-
-                            if b_tag is not None:
-                                val = b_tag.get(
-                                    "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
-                                )
-                                obj["bold"] = val is None or val in [
-                                    "1",
-                                    "true",
-                                    "True",
-                                ]
-                            else:
-                                obj["bold"] = False
-
-                            i_tag = child.find(".//w:i", namespaces=child.nsmap)
-
-                            if i_tag is None:
-                                p_pr = paragraph.find(
-                                    ".//w:rPr/w:i", namespaces=child.nsmap
-                                )
-                                if p_pr is not None:
-                                    i_tag = p_pr.find(
-                                        ".//w:pPr", namespaces=child.nsmap
-                                    )
-
-                            if i_tag is not None:
-                                val = i_tag.get(
-                                    "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
-                                )
-                                obj["italic"] = val is None or val in [
-                                    "1",
-                                    "true",
-                                    "True",
-                                ]
-                            else:
-                                obj["italic"] = False
-
-                            s_tag = child.find(".//w:spacing", namespaces=child.nsmap)
-
-                            if s_tag is None:
-                                p_pr = paragraph.find(
-                                    ".//w:rPr/w:spacing", namespaces=child.nsmap
-                                )
-                                if p_pr is not None:
-                                    s_tag = p_pr.find(
-                                        ".//w:pPr", namespaces=child.nsmap
-                                    )
-
-                            if s_tag is not None:
-                                val = s_tag.get(
-                                    "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}before"
-                                )
-                                obj["spacing"] = not (val is None)
-                            else:
-                                obj["spacing"] = False
+                for child in paragraph:
+                    if child.tag == "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}hyperlink":
+                        for r in child.findall("w:r", namespaces=child.nsmap):
+                            t_elem = r.find("w:t", namespaces=child.nsmap)
+
+                            if t_elem is not None and t_elem.text:
+                                text_paragraph.append(t_elem.text)
+
+                    elif child.tag == "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r":
+                        namespaces = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
+                        sz_element = child.find(".//w:sz", namespaces=child.nsmap)
+                        obj["font_size"] = 12
+
+                        if sz_element is None:
+                            p_pr = paragraph.find(".//w:rPr/w:sz", namespaces=child.nsmap)
+
+                            if p_pr is not None:
+                                sz_element = p_pr.find(".//w:pPr", namespaces=child.nsmap)
+
+                        if sz_element is not None:
+                            xml_string = etree.tostring(sz_element, pretty_print=True, encoding="unicode")
+                            size_element = objectify.fromstring(xml_string)
+                            font_size_value = size_element.get(namespaces + "val")
+                            obj["font_size"] = int(font_size_value) / 2
+
+                        color_element = child.find(".//w:color", namespaces=child.nsmap)
 
-                            clean_text = clean_labels(child.text)
+                        if color_element is None:
+                            p_pr = paragraph.find(".//w:pPr", namespaces=child.nsmap)
+
+                            if p_pr is not None:
+                                color_element = p_pr.find(".//w:rPr/w:color", namespaces=child.nsmap)
+
+                        if color_element is not None:
+                            xml_string_color = etree.tostring(color_element, pretty_print=True, encoding="unicode")
+                            object_element = objectify.fromstring(xml_string_color)
+                            color_value = object_element.get(namespaces + "val")
+                            obj["color"] = color_value
+
+                        b_tag = child.find(".//w:b", namespaces=child.nsmap)
+
+                        if b_tag is None:
+                            p_pr = paragraph.find(".//w:rPr/w:b", namespaces=child.nsmap)
+
+                            if p_pr is not None:
+                                b_tag = p_pr.find(".//w:pPr", namespaces=child.nsmap)
+
+                        if b_tag is not None:
+                            val = b_tag.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val')
+                            obj["bold"] = (val is None or val in ["1", "true", "True"])
+                        else:
+                            obj["bold"] = False
+
+                        i_tag = child.find(".//w:i", namespaces=child.nsmap)
+
+                        if i_tag is None:
+                            # Fall back to w:i under the paragraph's w:pPr/w:rPr (same lookup order as the color fallback)
+                            p_pr = paragraph.find(".//w:pPr", namespaces=child.nsmap)
+                            if p_pr is not None:
+                                i_tag = p_pr.find(".//w:rPr/w:i", namespaces=child.nsmap)
+
+                        if i_tag is not None:
+                            val = i_tag.get("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val")
+                            obj["italic"] = (val is None or val in ["1", "true", "True"])
+                        else:
+                            obj["italic"] = False
 
-                            # identifica sección
-                            sections = identify_section(
-                                sections, obj["font_size"], obj["bold"], clean_text
+                        s_tag = child.find(".//w:spacing", namespaces=child.nsmap)
+
+                        if s_tag is None:
+                            p_pr = paragraph.find(".//w:rPr/w:spacing", namespaces=child.nsmap)
+
+                            if p_pr is not None:
+                                s_tag = p_pr.find(".//w:pPr", namespaces=child.nsmap)
+
+                        if s_tag is not None:
+                            val = s_tag.get("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}before")
+                            obj["spacing"] = not (val is None)
+                        else:
+                            obj["spacing"] = False
+
+                        clean_text = clean_labels(child.text or "")
+
+                        # Identifica sección
+                        sections, current_section = identify_section(
+                            sections,
+                            obj["font_size"],
+                            obj["bold"],
+                            clean_text
+                        )
+
+                        if intro_section is None and current_section is not None and is_intro_heading(clean_text):
+                            intro_section = current_section.copy()
+
+                        if obj["italic"]:
+                            text_paragraph.append(
+                                "<italic>" + clean_text + "</italic>" + (f" {hiperlinks}" if hiperlinks else "")
+                            )
+                        else:
+                            text_paragraph.append(
+                                clean_text + (f" {hiperlinks}" if hiperlinks else "")
                             )
 
-                            if obj["italic"]:
-                                text_paragraph.append(
-                                    "<italic>"
-                                    + clean_text
-                                    + "</italic>"
-                                    + (f" {hiperlinks}" if hiperlinks else "")
-                                )
-                            else:
-                                text_paragraph.append(
-                                    clean_text
-                                    + (f" {hiperlinks}" if hiperlinks else "")
-                                )
+                        paraph = match_paragraph(clean_text)
 
-                            paraph = match_paragraph(clean_text)
-                            if paraph:
-                                obj["paraph"] = paraph
-                                obj["type"] = paraph
+                        if paraph:
+                            obj["paraph"] = paraph
+                            obj["type"] = paraph
 
-                            if review_fb:
-                                found_fb = any(
-                                    word in clean_text.lower() for word in start_text
-                                )
+                        if review_fb:
+                            found_fb = any(word in clean_text.lower() for word in start_text)
+
+                        # Si se encontró alguna palabra, incluye todo lo anterior en un solo bloque
+                        if found_fb:
+                            found_fb = False
+                            review_fb = False
+                            found_hiperlinks = False
+
+                            first_block = ""
+                            tmp_content = []
+                            abstract_mode = False
+                            abstract_started = False
 
-                            # Si se encontró alguna palabra, incluye todo lo anterior en un sólo bloque
-                            if found_fb:
-                                found_fb = False
-                                review_fb = False
-                                found_hiperlinks = False
-                                sections = [sections[-1]]
-                                first_block = ""
-                                tmp_content = []
-                                abstract_mode = False
-                                abstract_started = False
-
-                                for c in content:
-                                    if abstract_mode:
-                                        if not abstract_started:
-                                            if c['text'] == '' or c['spacing'] is True:
-                                                continue
-                                            else:
-                                                abstract_started = True
-                                                tmp_content.append(c)
-                                                continue
-                                        
-                                        # empezó el abstract: sí encuentra vacío marca fin
-                                        if c['text'] == '' or c['spacing'] is True:
-                                            abstract_mode = False
-                                            abstract_started = False
+                            for c in content:
+                                if abstract_mode:
+                                    if not abstract_started:
+                                        if c.get("text") == "" or c.get("spacing") is True:
                                             continue
                                         else:
+                                            abstract_started = True
                                             tmp_content.append(c)
                                             continue
 
-                                    if "paraph" in c:
-                                        tmp_content.append(c)
+                                    # Ya empezó el abstract: aquí sí un vacío marca fin
+                                    if c.get("text") == "" or c.get("spacing") is True:
                                         abstract_mode = False
                                         abstract_started = False
-                                        if c['paraph'] == '<abstract>':
-                                            abstract_mode = True
-                                            abstract_started = False
-                                            continue                                        
+                                        continue
                                     else:
-                                        if "text" in c:
-                                            first_block = first_block + "\n" + c["text"]
-                                        if "table" in c:
-                                            first_block = (
-                                                first_block + "\n" + c["table"]
-                                            )
-
-                                obj_b = {}
-                                obj_b["type"] = "first_block"
-                                obj_b["text"] = first_block
-                                tmp_content.append(obj_b)
-                                content = tmp_content
-                                start_text = []
-
-                        if child.tag == f"{{{ns_math['m']}}}oMath":
-                            if "text" not in obj or not isinstance(obj["text"], list):
-                                obj["type"] = "compound"
-                                obj["text"] = []
-                            if len(text_paragraph) > 0:
-                                obj2 = {}
-                                obj2["type"] = "text"
-                                obj2["value"] = " ".join(text_paragraph)
-                                obj["text"].append(obj2)
-                                text_paragraph = []
-
-                            mathml_result = transform(child)
-                            mathml_root = etree.fromstring(str(mathml_result))
-                            self.replace_mfenced_pipe_only(mathml_root)
+                                        tmp_content.append(c)
+                                        continue
+
+                                if "paraph" in c:
+                                    tmp_content.append(c)
+                                    abstract_mode = False
+                                    abstract_started = False
+
+                                    if c["paraph"] == "<abstract>":
+                                        abstract_mode = True
+                                        abstract_started = False
+                                        continue
+
+                                else:
+                                    if "text" in c:
+                                        first_block = first_block + "\n" + c["text"]
+
+                                    if "table" in c:
+                                        first_block = first_block + "\n" + c["table"]
+
+                            obj_b = {}
+                            obj_b["type"] = "first_block"
+                            obj_b["text"] = first_block
+                            tmp_content.append(obj_b)
+                            content = tmp_content
+                            start_text = []
+
+                    if child.tag == f"{{{ns_math['m']}}}oMath":
+                        if "text" not in obj or not isinstance(obj["text"], list):
+                            obj["type"] = "compound"
+                            obj["text"] = []
+
+                        if len(text_paragraph) > 0:
                             obj2 = {}
-                            obj2["type"] = "formula"
-                            obj2["value"] = etree.tostring(
-                                mathml_root, pretty_print=True, encoding="unicode"
-                            )
+                            obj2["type"] = "text"
+                            obj2["value"] = " ".join(text_paragraph)
                             obj["text"].append(obj2)
+                            text_paragraph = []
 
-                    if "text" not in obj:
-                        obj["text"] = (" ".join(text_paragraph)).strip()
-                        clean_text = clean_labels(obj["text"])
-                        obj["text"] = clean_text
+                        mathml_result = transform(child)
+                        mathml_root = etree.fromstring(str(mathml_result))
+                        self.replace_mfenced_pipe_only(mathml_root)
 
-                        paraph = match_paragraph(obj["text"])
-                        if paraph:
-                            obj["paraph"] = paraph
-                            obj["type"] = paraph
-
-                        if is_numPr:
-                            if "font_size" in obj:
-                                del obj["font_size"]
-                            current_list.append(f'[list-item]{obj["text"]}[/list-item]')
-                    if isinstance(obj["text"], list) and len(text_paragraph) > 0:
                         obj2 = {}
-                        obj2["type"] = "text"
-                        obj2["value"] = " ".join(text_paragraph)
+                        obj2["type"] = "formula"
+                        obj2["value"] = etree.tostring(mathml_root, pretty_print=True, encoding="unicode")
                         obj["text"].append(obj2)
-                        text_paragraph = []
+
+                if "text" not in obj:
+                    obj["text"] = (" ".join(text_paragraph)).strip()
+                    clean_text = clean_labels(obj["text"])
+                    obj["text"] = clean_text
+
+                    paraph = match_paragraph(obj["text"])
+
+                    if paraph:
+                        obj["paraph"] = paraph
+                        obj["type"] = paraph
+
+                    if is_numPr:
+                        if "font_size" in obj:
+                            del obj["font_size"]
+
+                        current_list.append(f'[list-item]{obj["text"]}[/list-item]')
+
+                if isinstance(obj.get("text"), list) and len(text_paragraph) > 0:
+                    obj2 = {}
+                    obj2["type"] = "text"
+                    obj2["value"] = " ".join(text_paragraph)
+                    obj["text"].append(obj2)
+                    text_paragraph = []
+
+                # Solo los párrafos que NO son lista se agregan directamente
+                if not is_numPr:
+                    content.append(obj)
 
             elif isinstance(element, CT_Tbl):
+                # Si una tabla viene después de una lista, primero se cierra la lista
+                if len(current_list) > 0:
+                    current_list.append("[/list]")
+                    objl = {}
+                    objl["type"] = "list"
+                    objl["list"] = "\n".join(current_list)
+                    current_list = []
+                    content.append(objl)
+
                 namespaces = {
                     "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
                     "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
                     "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
                 }
 
-                table = element
                 table_data = extrae_Tabla(element, hiperlinks_info, namespaces)
+
                 obj = {}
                 obj["type"] = "table"
                 obj["table"] = table_data
 
-            if not is_numPr:
+                # Las tablas no dependen de is_numPr
                 content.append(obj)
+
+        # Si el documento termina con una lista, se cierra aquí
+        if len(current_list) > 0:
+            current_list.append("[/list]")
+            objl = {}
+            objl["type"] = "list"
+            objl["list"] = "\n".join(current_list)
+            current_list = []
+            content.append(objl)
+
         sections.sort(key=section_priority)
-        return sections, content
+
+        if intro_section is not None:
+            for index, section in enumerate(sections):
+                if matches_section(section, intro_section):
+                    sections = sections[index:]
+                    break
+
+        return sections, content
\ No newline at end of file

From cc368ddf338dc253cc5c620c7c24f4186f2b097e Mon Sep 17 00:00:00 2001
From: Edgar <eduranm@dgb.unam.mx>
Date: Thu, 28 May 2026 10:06:13 -0600
Subject: [PATCH 06/13] =?UTF-8?q?Manejo=20de=20autores=20con=20m=C3=A1s=20?=
 =?UTF-8?q?de=20una=20afiliaci=C3=B3n,=20limpieza=20de=20elementos=20que?=
 =?UTF-8?q?=20corrompen=20XML=20en=20tablas?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 markup_doc/xml.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/markup_doc/xml.py b/markup_doc/xml.py
index ef26904..11e78aa 100644
--- a/markup_doc/xml.py
+++ b/markup_doc/xml.py
@@ -688,9 +688,9 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None):
             node_table_text = d["value"]["content"]
 
             # Quitar saltos de línea y espacios extra
-            node_table_text = re.sub(r"\s*\n\s*", "", node_table_text).replace(
-                "<br>", ""
-            )
+            node_table_text = re.sub(r"\s*\n\s*", "", node_table_text).replace("<br>","")
+            node_table_text = re.sub(r"<(?![/a-zA-Z_])", "&lt;", node_table_text)
+            node_table_text = node_table_text.replace("&nbsp;", " ")
             node_table_text = re.sub(r"&(?!\w+;|#\d+;)", "&amp;", node_table_text)
 
             tabla_element = parse_xml_fragment(node_table_text)

From c14a7b8d2abca0b8cea825675ef7f0a2a7960d87 Mon Sep 17 00:00:00 2001
From: Edgar <eduranm@dgb.unam.mx>
Date: Thu, 28 May 2026 10:08:08 -0600
Subject: [PATCH 07/13] =?UTF-8?q?Quita=20la=20detecci=C3=B3n=20de=20refren?=
 =?UTF-8?q?cias=20en=20orden=20alfab=C3=A9tico,=20uso=20de=20IA=20para=20i?=
 =?UTF-8?q?dentificar=20si=20es=20una=20referencia?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 markup_doc/tasks.py | 144 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 108 insertions(+), 36 deletions(-)

diff --git a/markup_doc/tasks.py b/markup_doc/tasks.py
index 65db63c..d52dcfc 100644
--- a/markup_doc/tasks.py
+++ b/markup_doc/tasks.py
@@ -28,7 +28,7 @@
     process_references,
     split_in_three,
     create_special_content_object,
-    comes_before_or_equal
+    split_abstract_inline
 )
 from markup_doc.models import MarkupXML, ProcessStatus, UploadDocx
 from markup_doc.sync_api import sync_issues_from_api, sync_journals_from_api
@@ -219,17 +219,37 @@ def get_labels(article_id, user_id):
 
         obj = {}
         if item.get("type") in [
-            "<abstract>",
-            "<date-accepted>",
-            "<date-received>",
-            "<kwd-group>",
-        ]:
+                                    "<abstract>", 
+                                    "<date-accepted>", 
+                                    "<date-received>",
+                                    "<kwd-group>"
+                                    ]:
             if item.get("type") == "<abstract>":
-                if i + 1 < len(content):
+                inline_abstract = split_abstract_inline(item.get("text"))
+
+                if inline_abstract:
+                    abstract_title, abstract_text = inline_abstract
+
+                    obj["type"] = "paragraph"
+                    obj["value"] = {
+                        "label": "<abstract-title>",
+                        "paragraph": abstract_title
+                    }
+                    stream_data.append(obj.copy())
+
+                    obj["type"] = "paragraph_with_language"
+                    obj["value"] = {
+                        "label": "<abstract>",
+                        "paragraph": abstract_text,
+                        "language": langid.classify(abstract_text)[0] or None
+                    }
+                    stream_data.append(obj.copy())
+
+                elif i + 1 < len(content):
                     obj["type"] = "paragraph"
                     obj["value"] = {
                         "label": "<abstract-title>",
-                        "paragraph": item.get("text"),
+                        "paragraph": item.get("text")
                     }
                     stream_data.append(obj.copy())
 
@@ -377,6 +397,7 @@ def get_labels(article_id, user_id):
         if item.get('text') is None or item.get('text') == '':
             state['label_next'] = state['label_next_reset'] if state['reset'] else state['label_next']
             if state['back'] and num_ref > 0:
+                #state['back'] = False
                 state['body'] = False
                 state['references'] = True
         else:
@@ -406,37 +427,56 @@ def get_labels(article_id, user_id):
                     else:
                         stream_data_body.append(obj)
                 elif state['back']:
-                    if state['references']:
-                        obj_postreference.append(obj)
-                    elif state['label'] == '<sec>':
+                    if state['label'] == '<sec>':
                         stream_data_back.append(obj)
-                    elif state['label'] == '<p>':
-                        if last_obj is not None and not re.search(r"^(refer)",last_obj.get('value', {}).get('paragraph', '').strip().lower()):
-                            if comes_before_or_equal(last_obj, obj):
-                                num_ref = num_ref + 1
-                                obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],})
-                            else:
-                                obj_postreference.append(obj)
-                                state['references'] = True
-                        else:
-                            num_ref = num_ref + 1
-                            obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],})
-                        last_obj = obj
+                    if state['label'] == '<p>':
+                        num_ref = num_ref + 1
+                        #obj = {}#process_reference(num_ref, obj, user_id)
+                        obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],})
+                    #stream_data_back.append(obj)
                 else:
                     stream_data.append(obj)
 
-    num_refs = [item["num_ref"] for item in obj_reference]
-
     if get_llm_model_name() == "LLAMA":
         for obj_ref in obj_reference:
-            obj = process_reference(obj_ref["num_ref"], obj_ref["obj"], user_id)
-            stream_data_back.append(obj)
+            obj = process_reference(obj_ref['num_ref'], obj_ref['obj'], user_id)
+
+            is_reference = obj.get('is_reference', True)
+
+            # Por si el modelo devuelve "false" como string
+            if isinstance(is_reference, str):
+                is_reference = is_reference.lower() == 'true'
+
+            if is_reference:
+                # Opcional: quitar is_reference si no lo necesitas en el StreamField
+                obj.pop('is_reference', None)
+                stream_data_back.append(obj)
+
+            else:
+                full_text = (
+                    obj.get('full_text')
+                    or obj_ref.get('text')
+                    or obj_ref.get('obj', {}).get('text')
+                    or ''
+                )
+
+                obj_no_reference = {
+                    'type': 'paragraph',
+                    'value': {
+                        'label': '<p>',
+                        'paragraph': full_text
+                    }
+                }
+
+                obj_postreference.append(obj_no_reference)
 
     else:
         if llm_first_block is None:
             llm_first_block = LlamaService(mode="prompt", temperature=0.1)
         chunks = split_in_three(obj_reference)
-        output = []
+
+        output_reference = []
+        num_refs_reference = []
         logger.info(
             "get_labels: processando %d referências com Gemini (%d chunks)",
             len(obj_reference),
@@ -445,21 +485,53 @@ def get_labels(article_id, user_id):
 
         for chunk in chunks:
             if len(chunk) > 0:
-                text_references = (
-                    "\n".join([item["text"] for item in chunk])
-                    .replace("<italic>", "")
-                    .replace("</italic>", "")
-                )
+                text_references = "\n".join(
+                    [item["text"] for item in chunk]
+                ).replace('<italic>', '').replace('</italic>', '')
+
                 prompt_reference = create_prompt_reference(text_references)
 
                 result = llm_first_block.run(prompt_reference)
 
-                match = re.search(r"\[.*\]", result, re.DOTALL)
+                match = re.search(r'\[.*\]', result, re.DOTALL)
+
                 if match:
                     parsed = json.loads(match.group(0))
-                    output.extend(parsed)  # Agrega a la lista de salida
 
-        stream_data_back.extend(process_references(num_refs, output))
+                    for index, item_response in enumerate(parsed):
+                        if index >= len(chunk):
+                            continue
+
+                        original_item = chunk[index]
+
+                        is_reference = item_response.get('is_reference', True)
+
+                        # Por si el modelo regresa "false" como texto
+                        if isinstance(is_reference, str):
+                            is_reference = is_reference.lower() == 'true'
+
+                        if is_reference:
+                            num_refs_reference.append(original_item["num_ref"])
+                            output_reference.append(item_response)
+
+                        else:
+                            full_text = (
+                                item_response.get('full_text')
+                                or original_item.get('text')
+                                or ''
+                            )
+
+                            obj_no_reference = {
+                                'type': 'paragraph',
+                                'value': {
+                                    'label': '<p>',
+                                    'paragraph': full_text
+                                }
+                            }
+
+                            obj_postreference.append(obj_no_reference)
+
+        stream_data_back.extend(process_references(num_refs_reference, output_reference))
         stream_data_back.extend(obj_postreference)
 
     # data_front is never iterated inside get_xml — rescue any <p> items that the

From 974e668f39bdbc1a147b5ed752a9e249b444dc9a Mon Sep 17 00:00:00 2001
From: Edgar <eduranm@dgb.unam.mx>
Date: Thu, 28 May 2026 10:10:49 -0600
Subject: [PATCH 08/13] =?UTF-8?q?Detecci=C3=B3n=20de=20secciones=20de=20ac?=
 =?UTF-8?q?uerdo=20a=20la=20longitud=20del=20texto,=20limpieza=20de=20cara?=
 =?UTF-8?q?cteres=20que=20corrompen=20XML,=20no=20rompe=20el=20ciclo=20si?=
 =?UTF-8?q?=20no=20existe=20un=20id,=20detecci=C3=B3n=20de=20resumen=20y?=
 =?UTF-8?q?=20contenido=20en=20la=20misma=20l=C3=ADnea?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 markup_doc/labeling_utils.py | 96 +++++++++++++++++++++++-------------
 1 file changed, 63 insertions(+), 33 deletions(-)

diff --git a/markup_doc/labeling_utils.py b/markup_doc/labeling_utils.py
index 2def4d2..3331dda 100644
--- a/markup_doc/labeling_utils.py
+++ b/markup_doc/labeling_utils.py
@@ -739,6 +739,8 @@ def match_section(item, sections):
 
 
 def match_subsection(item, sections):
+    if len(sections) <=2:
+        return None
     return (
         {"label": "<sub-sec>", "body": True}
         if (
@@ -764,59 +766,67 @@ def comes_before_or_equal(obj1, obj2):
     return p1 <= p2
 
 
-def create_labeled_object2(i, item, state, sections):
-    obj = {}
-    result = None
+def is_probable_heading(text, max_chars=100, max_words=5):
+    if not text:
+        return False
 
-    if match_section(item, sections):
-        result = match_section(item, sections)
-        state["label"] = result.get("label")
-        state["body"] = result.get("body")
+    words = text.split()
 
-    if match_subsection(item, sections):
-        result = match_subsection(item, sections)
-        state["label"] = result.get("label")
-        state["body"] = result.get("body")
+    # Si es muy largo, probablemente es párrafo
+    if len(text) > max_chars:
+        return False
 
-    if (
-        state.get("body")
-        and re.search(r"^(refer)", item.get("text").lower())
-        and match_section(item, sections)
-    ):
-        state["label"] = "<sec>"
-        state["body"] = False
-        state["back"] = True
-        obj["type"] = "paragraph"
-        obj["value"] = {"label": state["label"], "paragraph": item.get("text")}
+    if len(words) > max_words:
+        return False
 
-    if state.get("body") and re.search(
-        r"^(refer[eê]nci|references?)\s*$", item.get("text").strip().lower()
-    ):
-        state["label"] = "<sec>"
-        state["body"] = False
-        state["back"] = True
-        result = {"label": "<sec>", "body": False, "back": True}
-        obj["type"] = "paragraph"
-        obj["value"] = {"label": state["label"], "paragraph": item.get("text")}
+    return True
 
 
-    text = item.get('text', '').strip().lower()
+def create_labeled_object2(i, item, state, sections):
+    obj = {}
+    result = None
+
+    raw_text = item.get('text', '').strip()
+    text = raw_text.lower()
 
     is_references_title = bool(re.fullmatch(
         r"(?:referencias|references|referências)\s*[:.]?",
         text
     ))
 
-    if state.get('body') and is_references_title:  
+    is_heading_candidate = is_probable_heading(raw_text)
+
+    # Si es título de referencias, debe poder pasar aunque falle otra regla
+    if is_references_title:
+        is_heading_candidate = True
+
+    if is_heading_candidate:
+        section_result = match_section(item, sections)
+
+        if section_result:
+            result = section_result
+            state['label'] = result.get('label')
+            state['body'] = result.get('body')
+
+        else:
+            subsection_result = match_subsection(item, sections)
+
+            if subsection_result:
+                result = subsection_result
+                state['label'] = result.get('label')
+                state['body'] = result.get('body')
+
+    if state.get('body') and is_references_title:
         state['label'] = '<sec>'
         state['body'] = False
         state['back'] = True
+
         obj['type'] = 'paragraph'
         obj['value'] = {
             'label': state['label'],
             'paragraph': item.get('text')
         }
-    
+
     if not result:
         result = {"label": "<p>", "body": state["body"], "back": state["back"]}
         state["label"] = result.get("label")
@@ -1324,6 +1334,7 @@ def append_fragment(node_dest, val):
     #    - quitar saltos de línea
     clean = re.sub(r"(?i)<br\s*/?>", "", val)
     clean = clean.replace("\n", "")
+    clean = re.sub(r'<(?![/a-zA-Z_])', '&lt;', clean)
 
     # normaliza entidades problemáticas
     clean = clean.replace("&nbsp;", " ")
@@ -1418,3 +1429,22 @@ def proccess_special_content(text, data_body):
         )
 
     return res
+
+
+def split_abstract_inline(text):
+    """Split inline "Abstract: text" (also resumen/resumo) into (title, text), else None."""
+    if not text:
+        return None
+
+    pattern = r'(?is)^\s*(?:<italic>)?\s*(abstract|resumen|resumo)\s*(?:</italic>)?\s*[:.]\s*(.+)$'
+    match = re.match(pattern, text)
+    if not match:
+        return None
+
+    abstract_title = match.group(1).strip()
+    abstract_text = match.group(2).strip()
+
+    if not abstract_text:
+        return None
+
+    return abstract_title, abstract_text
\ No newline at end of file

From 1490e5ce90d0f438e5d8890034158095a7b16a42 Mon Sep 17 00:00:00 2001
From: Edgar <eduranm@dgb.unam.mx>
Date: Thu, 28 May 2026 10:48:36 -0600
Subject: [PATCH 09/13] Cambio en mensajes para identificar referencias

---
 model_ai/messages.py       | 22 +++++++++++++++++++++-
 reference/config_gemini.py | 29 +++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/model_ai/messages.py b/model_ai/messages.py
index b21c06d..1260fe8 100644
--- a/model_ai/messages.py
+++ b/model_ai/messages.py
@@ -356,7 +356,10 @@
 
 REFERENCE_MESSAGES = [
     {   'role': 'system',
-        'content': 'You are an assistant who distinguishes the metadata of a bibliographic reference and returns it in JSON format.'
+        'content': """You are an assistant who distinguishes the metadata of a bibliographic reference and returns it in JSON format.
+                      First determine whether the paragraph is a bibliographic reference.
+                      If the paragraph is not a bibliographic reference, do not analyze its metadata. Return only full_text and is_reference with value false.
+        """
     },
     {   'role': 'user',
         'content': """
@@ -365,6 +368,9 @@
     },
     {   'role': 'assistant',
         'content': json.dumps({
+                'full_text': 'Smith, J. (2020). Understanding AI. Journal of Technology, 15(3), 45-60. https://doi.org/10.1234/jtech.2020.015',
+                'is_reference': true,
+                'reftype': 'journal',
                 'authors': [
                                 {
                                     "name": "J.",
@@ -383,6 +389,17 @@
                 'doi': "10.1234/jtech.2020.015"
         })
     },
+    {   'role': 'user',
+        'content': """
+                        Caption Figures
+                   """
+    },
+    {   'role': 'assistant',
+        'content': json.dumps({
+            'full_text': 'Caption Figures',
+            'is_reference': false
+        })
+    }
 ]
 
 REFERENCE_RESPONSE_FORMAT = {
@@ -390,6 +407,9 @@
     'schema':{
         'type': 'object',
         'properties': {
+            'full_text': {'type': 'string'},
+            'is_reference': {'type': 'boolean'},
+            'reftype': {'type': 'string', 'enum': ['journal', 'thesis', 'book', 'data', 'webpage', 'software', 'confproc']},
             'authors': {'type': 'array',
                         'items': {
                                     'type': 'object',
diff --git a/reference/config_gemini.py b/reference/config_gemini.py
index d7fb18f..7541bf4 100644
--- a/reference/config_gemini.py
+++ b/reference/config_gemini.py
@@ -4,6 +4,9 @@ def create_prompt_reference(references):
     You are an assistant who distinguishes all the components of all citations in an article with output in JSON
 
     Rules:
+    - First determine whether the paragraph is a bibliographic reference.
+    - If the paragraph is not a bibliographic reference, do not analyze its metadata. Return only full_text and is_reference with value false.
+    - If the paragraph is a bibliographic reference, set is_reference to true and extract the metadata.
     - If a DOI is present in the citation, it must be included in the doi field, and the uri field must be None. If there is no DOI, then a valid persistent URL (e.g., from a repository or publisher) must be provided in the uri field instead. One of these fields — doi or uri — must always be populated. Never leave both empty.
     - For references of type journal, the field pages must not be included, even if they appear in the original citation. Instead, the page range should be provided only in the fields fpage and lpage.
     - Consider that in book-type references, the source field generally refers to the title of the book, so do not use the title field in this case, only source.
@@ -16,6 +19,7 @@ def create_prompt_reference(references):
             "type": "object",
             "properties": {{
                 "full_text": {{"type": "string"}},
+                "is_reference": {{"type": "boolean"}},
                 "reftype": {{"type": "string", "enum": ["journal", "thesis", "book", "data", "webpage", "software", "confproc"]}},
                 "authors": {{"type": "array",
                             "items": {{
@@ -81,11 +85,16 @@ def create_prompt_reference(references):
 
     Furton EJ, Dort V, editors. Addiction and compulsive behaviors. Proceedings of the 17th Workshop for Bishops; 1999; Dallas, TX. Boston: National Catholic Bioethics Center (US); 2000. 258 p.
     
+    Caption Figures
+
+    Author's Address.
+
     Response:
 
     [
     {{
                         "full_text": "Bachman, S., J. Moat, A. W. Hill, J. de la Torre and B. Scott. 2011. Supporting Red List threat assessments with GeoCAT: geospatial conservation assessment tool. ZooKeys 150: 117-126. DOI: https://doi.org/10.3897/zookeys.150.2109",
+                        "is_reference": true,
                         "reftype": "journal",
                         "authors":  [
                             {{   "surname": "Bachman", "fname": "S." }},
@@ -105,6 +114,7 @@ def create_prompt_reference(references):
                 }},
     {{
                         "full_text": "Brunel, J. F. 1987. Sur le genre Phyllanthus L. et quelques genres voisins de la Tribu des Phyllantheae Dumort. (Euphorbiaceae, Phyllantheae) en Afrique intertropicale et à Madagascar. Thèse de doctorat de l’Université L. Pasteur. Strasbourg, France. 760 pp.",
+                        "is_reference": true,
                         "reftype": "Thesis",
                         "authors":  [
                             {{   "surname": "Brunel", "fname": "J. F." }},
@@ -118,6 +128,7 @@ def create_prompt_reference(references):
                 }},
     {{
                         "full_text": "Hernández-López, L. 1995. The endemic flora of Jalisco, Mexico: Centers of endemism and implications for conservation. Tesis de maestría. Universidad de Wisconsin. Madison, USA. 74 pp.",
+                        "is_reference": true,
                         "reftype": "Thesis",
                         "authors":  [
                             {{   "surname": "Hernández-López", "fname": "L." }},
@@ -131,6 +142,7 @@ def create_prompt_reference(references):
                 }},
     {{
                         "full_text": "Jones DL. The role of physical activity on the need for revision total knee arthroplasty in individuals with osteoarthritis of the knee [dissertation]. [Pittsburgh (PA)]: University of Pittsburgh; 2001. 436 p.",
+                        "is_reference": true,
                         "reftype": "Thesis",
                         "authors":  [
                             {{   "surname": "Jones", "fname": "DL" }},
@@ -143,6 +155,7 @@ def create_prompt_reference(references):
                 }},
     {{
                     "full_text": "Schimper, A. F. W. 1903. Plant geography upon a physiological basis. Clarendon Press. Oxford, UK. 839 pp.",
+                    "is_reference": true,
                     "reftype": "book",
                     "authors":[
                         {{   "surname": "Schimper", "fname": "A. F. W." }},
@@ -169,6 +182,7 @@ def create_prompt_reference(references):
                 }},
     {{
                     "full_text": "Hernández-López, L. 2019. Las especies endémicas de plantas en el estado de Jalisco: su distribución y conservación. Comisión Nacional para el Conocimiento y Uso de la Biodiversidad (CONABIO). Cd. Mx., México. https://doi.org/10.15468/ktvqds (consultado diciembre de 2019).",
+                    "is_reference": true,
                     "reftype": "data",
                     "authors":[
                         {{   "surname": "Hernández-López", "fname": "L." }},
@@ -182,6 +196,7 @@ def create_prompt_reference(references):
                 }},
     {{
                     "full_text": "Lucas Leão; Perobelli, Fernando Salgueiro; Ribeiro, Hilton Manoel Dias, 2024, Data for: Ação Coletiva Institucional e Consórcio Públicos Intermunicipais no Brasil, DOI: 10.48331/scielodata.5Z4TMP, SciELO Data, V1, UNF:6:Neyjad4du3rFprhupCXizA== [fileUNF]. Disponível em: https://doi.org/10.48331/scielodata",
+                    "is_reference": true,
                     "reftype": "data",
                     "authors":[
                         {{   "surname": "Leão", "fname": "Lucas" }},
@@ -198,6 +213,7 @@ def create_prompt_reference(references):
                 }},
     {{
                     "full_text": "INAFED. 2010. Enciclopedia de los Municipios y Delegaciones de México: Jalisco. Instituto Nacional para el Federalismo y el Desarrollo Municipal. http://www.inafed.gob.mx/ work/enciclopedia/EMM21puebla/index.html (consultado diciembre de 2018).",
+                    "is_reference": true,
                     "reftype": "webpage",
                     "authors":[
                         {{   "collab": "INAFED" }},
@@ -210,6 +226,7 @@ def create_prompt_reference(references):
                 }},
     {{
                     "full_text": "COB - Comitê Olímpico Brasileiro. Desafio para o corpo. Disponível em: http://www.cob.org.br/esportes/esporte.asp?id=39. (Acesso em 10 abr 2010)",
+                    "is_reference": true,
                     "reftype": "webpage",
                     "authors":[
                         {{   "collab": "COB -Comitê Olímpico Brasileiro" }},
@@ -221,6 +238,7 @@ def create_prompt_reference(references):
                 }},
     {{
                     "full_text": "Nikon Corporation. 1991-2006. NIS- Elements, version 2.33. Tokio, Japón.",
+                    "is_reference": true,
                     "reftype": "software",
                     "authors":[
                         {{   "collab": "Nikon Corporation" }},
@@ -233,6 +251,7 @@ def create_prompt_reference(references):
                 }},
     {{
                     "full_text": "Hamric, Ann B.; Spross, Judith A.; Hanson, Charlene M. Advanced practice nursing: an integrative approach. 3rd ed. St. Louis (MO): Elsevier Saunders; c2005. 979 p.",
+                    "is_reference": true,
                     "reftype": "book",
                     "authors":[
                         {{   "surname": "Hamric", "fname": "Ann B." }},
@@ -248,6 +267,7 @@ def create_prompt_reference(references):
                 }},
     {{
                     "full_text": "Calkins BM, Mendeloff AI. The epidemiology of idiopathic inflammatory bowel disease. In: Kirsner JB, Shorter RG, eds. Inflammatory bowel disease, 4th ed. Baltimore: Williams & Wilkins. 1995:31-68.",
+                    "is_reference": true,
                     "reftype": "book",
                     "authors":[
                         {{   "surname": "Calkins", "fname": "BM" }},
@@ -268,6 +288,7 @@ def create_prompt_reference(references):
                 }},
     {{
                     "full_text": "Furton EJ, Dort V, editors. Addiction and compulsive behaviors. Proceedings of the 17th Workshop for Bishops; 1999; Dallas, TX. Boston: National Catholic Bioethics Center (US); 2000. 258 p.",
+                    "is_reference": true,
                     "reftype": "confproc",
                     "authors":[
                         {{   "surname": "Furton", "fname": "EJ" }},
@@ -281,6 +302,14 @@ def create_prompt_reference(references):
                     "organization": "National Catholic Bioethics Center (US)",
                     "org_location": "Boston",
                     "pages": "258 p"
+                }},
+    {{
+                    "full_text": "Caption Figures",
+                    "is_reference": false
+                }},
+    {{
+                    "full_text": "Author's Address.",
+                    "is_reference": false
                 }}
     ]
 

From f7a7a28c7be3873746ce25ab783e4e255b746c5d Mon Sep 17 00:00:00 2001
From: Edgar <eduranm@dgb.unam.mx>
Date: Tue, 2 Jun 2026 16:04:10 -0600
Subject: [PATCH 10/13] Agrega dateiso al modelo

---
 markup_doc/models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/markup_doc/models.py b/markup_doc/models.py
index bddbc37..67868f3 100644
--- a/markup_doc/models.py
+++ b/markup_doc/models.py
@@ -471,6 +471,7 @@ class ArticleDocxMarkup(CommonControlField, ClusterableModel):
     spsversion = models.TextField(_("Sps version"), null=True, blank=True)
     artdate = models.DateField(_("Artdate"), null=True, blank=True)
     ahpdate = models.DateField(_("Ahpdate"), null=True, blank=True)
+    dateiso = models.TextField(_("Dateiso"), null=True, blank=True)
 
     file_xml = models.FileField(
         null=True,

From 85ca4af2fa5c797ec28a2fbe3aac8d22933e2ac8 Mon Sep 17 00:00:00 2001
From: Edgar <eduranm@dgb.unam.mx>
Date: Tue, 2 Jun 2026 16:04:41 -0600
Subject: [PATCH 11/13] =?UTF-8?q?Agrega=20identificaci=C3=B3n=20de=20p?=
 =?UTF-8?q?=C3=A1rrafo=20o=20referencia?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 model_ai/messages.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/model_ai/messages.py b/model_ai/messages.py
index 1260fe8..7863780 100644
--- a/model_ai/messages.py
+++ b/model_ai/messages.py
@@ -340,7 +340,7 @@
                                             'state': {'type': 'string'},
                                             'code_country': {'type': 'string'},
                                             'name_country': {'type': 'string'},
-                                            'text_aff': {'type': 'text'}
+                                            'text_aff': {'type': 'string'}
                                         },
                                         "required": [
                                             "aff", "char", "orgname", "orgdiv1", "orgdiv2",
@@ -369,7 +369,7 @@
     {   'role': 'assistant',
         'content': json.dumps({
                 'full_text': 'Smith, J. (2020). Understanding AI. Journal of Technology, 15(3), 45-60. https://doi.org/10.1234/jtech.2020.015',
-                'is_reference': true,
+                'is_reference': True,  # bool -> JSON true; the response schema declares is_reference as boolean
                 'reftype': 'journal',
                 'authors': [
                                 {
@@ -397,7 +397,7 @@
     {   'role': 'assistant',
         'content': json.dumps({
             'full_text': 'Caption Figures',
-            'is_reference': false
+            'is_reference': False
         })
     }
 ]

From 972c902d128bdae280dba5636cb2a65418294821 Mon Sep 17 00:00:00 2001
From: Edgar <eduranm@dgb.unam.mx>
Date: Tue, 2 Jun 2026 16:05:17 -0600
Subject: [PATCH 12/13] Corrige nombre estatus

---
 reference/api/v1/views.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reference/api/v1/views.py b/reference/api/v1/views.py
index 20334a6..82511df 100755
--- a/reference/api/v1/views.py
+++ b/reference/api/v1/views.py
@@ -40,7 +40,7 @@ def api_reference(self, request):
             except Reference.DoesNotExist:
                 new_reference = Reference.objects.create(
                     mixed_citation=post_reference,
-                    status=ReferenceStatus.CREATING,
+                    estatus=ReferenceStatus.CREATING,
                     creator=self.request.user,
                 )
 

From 3125e2599f1d06a4217753ed78851bacc7697cf6 Mon Sep 17 00:00:00 2001
From: Edgar <eduranm@dgb.unam.mx>
Date: Tue, 2 Jun 2026 16:05:48 -0600
Subject: [PATCH 13/13] =?UTF-8?q?Agrega=20identificaci=C3=B3n=20de=20p?=
 =?UTF-8?q?=C3=A1rrafo=20o=20referencia?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 reference/config.py | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/reference/config.py b/reference/config.py
index 7c385e0..a6e726d 100644
--- a/reference/config.py
+++ b/reference/config.py
@@ -9,6 +9,7 @@
         },
         {   'role': 'assistant',
             'content': json.dumps({
+                    'is_reference': True,  # bool -> JSON true, as RESPONSE_FORMAT's boolean type expects
                     'reftype': 'journal',
                     'authors':  [
                         {   'surname': 'Bachman', 'fname': 'S.' },
@@ -33,6 +34,7 @@
         },
         {   'role': 'assistant',
             'content': json.dumps({
+                    'is_reference': True,
                     'reftype': 'Thesis',
                     'authors':  [
                         {   'surname': 'Brunel', 'fname': 'J. F.' },
@@ -52,7 +54,8 @@
             'content': 'Hernández-López, L. 1995. The endemic flora of Jalisco, Mexico: Centers of endemism and implications for conservation. Tesis de maestría. Universidad de Wisconsin. Madison, USA. 74 pp.'
         },
         {   'role': 'assistant',
-            'content': json.dumps({
+            'content': json.dumps({                
+                    'is_reference': True,
                     'reftype': 'Thesis',
                     'authors':  [
                         {   'surname': 'Hernández-López', 'fname': 'L.' },
@@ -73,6 +76,7 @@
         {
             'role': 'assistant',
             'content': json.dumps({
+                'is_reference': True,
                 'reftype': 'book',
                 'authors':[
                     {   'surname': 'Schimper', 'fname': 'A. F. W.' },
@@ -92,6 +96,7 @@
         {
             'role': 'assistant',
             'content': json.dumps({
+                'is_reference': True,
                 'reftype': 'book',
                 'authors':[
                     {   'surname': 'Correa', 'fname': 'M. D.' },
@@ -113,6 +118,7 @@
         {
             'role': 'assistant',
             'content': json.dumps({
+                'is_reference': True,
                 'reftype': 'data',
                 'authors':[
                     {   'surname': 'Hernández-López', 'fname': 'L.' },
@@ -152,6 +158,7 @@
         {
             'role': 'assistant',
             'content': json.dumps({
+                'is_reference': True,
                 'reftype': 'software',
                 'authors':[
                     {   'collab': 'Nikon Corporation' },
@@ -171,6 +178,7 @@
         {
             'role': 'assistant',
             'content': json.dumps({
+                    'is_reference': True,
                     'reftype': 'confproc',
                     'full_text': 'Furton EJ, Dort V, editors. Addiction and compulsive behaviors. Proceedings of the 17th Workshop for Bishops; 1999; Dallas, TX. Boston: National Catholic Bioethics Center (US); 2000. 258 p.',
                     'authors':[
@@ -187,6 +195,28 @@
                     'pages': '258 p'
             })
         },
+        {
+            'role': 'user',
+            'content': 'Caption Figures'
+        },
+        {
+            'role': 'assistant',
+            'content': json.dumps({
+                    'full_text': 'Caption Figures',
+                    'is_reference': False
+                }),
+        },
+        {
+            'role': 'user',
+            'content': 'Author\'s Address.'
+        },
+        {
+            'role': 'assistant',
+            'content': json.dumps({
+                    'full_text': 'Author\'s Address.',
+                    'is_reference': False
+                }),
+        }
         ]
 
 RESPONSE_FORMAT = {
@@ -194,6 +224,7 @@
         'schema':{
             'type': 'object',
             'properties': {
+                'is_reference': {'type': 'boolean'},
                 'reftype': {'type': 'string', 'enum': ['journal', 'thesis', 'book', 'data', 'webpage', 'software', 'confproc']},
                 'authors': {'type': 'array',
                             'items': {
@@ -205,7 +236,7 @@
                                         }
                                 }
                             },
-                "full_text": {"type": "integer"},
+                "full_text": {"type": "string"},
                 "date": {"type": "integer"},
                 "title": {"type": "string"},
                 "chapter": {"type": "string"},