diff --git a/markup_doc/labeling_utils.py b/markup_doc/labeling_utils.py index af790ff..3331dda 100644 --- a/markup_doc/labeling_utils.py +++ b/markup_doc/labeling_utils.py @@ -2,6 +2,8 @@ import html import json import re +import requests +import unicodedata import requests @@ -737,6 +739,8 @@ def match_section(item, sections): def match_subsection(item, sections): + if len(sections) <=2: + return None return ( {"label": "", "body": True} if ( @@ -748,41 +752,80 @@ def match_subsection(item, sections): ) +def normalize_text(text): + text = re.sub(r'<[^>]+>', '', text) # quita etiquetas + text = text.strip().lower() + text = unicodedata.normalize('NFD', text) + text = ''.join(c for c in text if unicodedata.category(c) != 'Mn') + return text + + +def comes_before_or_equal(obj1, obj2): + p1 = normalize_text(obj1.get('value', {}).get('paragraph', '')) + p2 = normalize_text(obj2.get('value', {}).get('paragraph', '')) + return p1 <= p2 + + +def is_probable_heading(text, max_chars=100, max_words=5): + if not text: + return False + + words = text.split() + + # Si es muy largo, probablemente es párrafo + if len(text) > max_chars: + return False + + if len(words) > max_words: + return False + + return True + + def create_labeled_object2(i, item, state, sections): obj = {} result = None - if match_section(item, sections): - result = match_section(item, sections) - state["label"] = result.get("label") - state["body"] = result.get("body") + raw_text = item.get('text', '').strip() + text = raw_text.lower() - if match_subsection(item, sections): - result = match_subsection(item, sections) - state["label"] = result.get("label") - state["body"] = result.get("body") + is_references_title = bool(re.fullmatch( + r"(?:referencias|references|referências)\s*[:.]?", + text + )) - if ( - state.get("body") - and re.search(r"^(refer)", item.get("text").lower()) - and match_section(item, sections) - ): - state["label"] = "" - state["body"] = False - state["back"] = True - obj["type"] = "paragraph" - obj["value"] = {"label": state["label"], "paragraph": item.get("text")} + is_heading_candidate = is_probable_heading(raw_text) - if state.get("body") and re.search( - r"^(refer[eê]nci|references?)\s*$", item.get("text").strip().lower() - ): - state["label"] = "" - state["body"] = False - state["back"] = True - result = {"label": "", "body": False, "back": True} - obj["type"] = "paragraph" - obj["value"] = {"label": state["label"], "paragraph": item.get("text")} + # Si es título de referencias, debe poder pasar aunque falle otra regla + if is_references_title: + is_heading_candidate = True + if is_heading_candidate: + section_result = match_section(item, sections) + + if section_result: + result = section_result + state['label'] = result.get('label') + state['body'] = result.get('body') + + else: + subsection_result = match_subsection(item, sections) + + if subsection_result: + result = subsection_result + state['label'] = result.get('label') + state['body'] = result.get('body') + + if state.get('body') and is_references_title: + state['label'] = '' + state['body'] = False + state['back'] = True + + obj['type'] = 'paragraph' + obj['value'] = { + 'label': state['label'], + 'paragraph': item.get('text') + } if not result: result = {"label": "

", "body": state["body"], "back": state["back"]} @@ -1291,6 +1334,7 @@ def append_fragment(node_dest, val): # - quitar saltos de línea clean = re.sub(r"(?i)", "", val) clean = clean.replace("\n", "") + clean = re.sub(r'<(?![/a-zA-Z_])', '<', clean) # normaliza entidades problemáticas clean = clean.replace(" ", " ") @@ -1385,3 +1429,22 @@ def proccess_special_content(text, data_body): ) return res + + +def split_abstract_inline(text): + if not text: + return None + + pattern = r'(?is)^\s*(?:)?\s*(abstract|resumen|resumo)\s*(?:)?\s*[:.]\s*(.+)$' + match = re.match(pattern, text) + + if not match: + return None + + abstract_title = match.group(1).strip() + abstract_text = match.group(2).strip() + + if not abstract_text: + return None + + return abstract_title, abstract_text \ No newline at end of file diff --git a/markup_doc/models.py b/markup_doc/models.py index bddbc37..67868f3 100644 --- a/markup_doc/models.py +++ b/markup_doc/models.py @@ -471,6 +471,7 @@ class ArticleDocxMarkup(CommonControlField, ClusterableModel): spsversion = models.TextField(_("Sps version"), null=True, blank=True) artdate = models.DateField(_("Artdate"), null=True, blank=True) ahpdate = models.DateField(_("Ahpdate"), null=True, blank=True) + dateiso = models.TextField(_("Dateiso"), null=True, blank=True) file_xml = models.FileField( null=True, diff --git a/markup_doc/tasks.py b/markup_doc/tasks.py index 130c16d..d52dcfc 100644 --- a/markup_doc/tasks.py +++ b/markup_doc/tasks.py @@ -27,6 +27,8 @@ process_reference, process_references, split_in_three, + create_special_content_object, + split_abstract_inline ) from markup_doc.models import MarkupXML, ProcessStatus, UploadDocx from markup_doc.sync_api import sync_issues_from_api, sync_journals_from_api @@ -206,6 +208,9 @@ def get_labels(article_id, user_id): next_item = None obj_reference = [] llm_first_block = None + obj_postreference = [] + last_obj = None + llama_model = False for i, item in enumerate(content): if next_item: @@ -214,17 +219,37 @@ def get_labels(article_id, user_id): obj = {} if item.get("type") in [ - "", - "", - "", - "", - ]: + "", + "", + "", + "" + ]: if item.get("type") == "": - if i + 1 < len(content): + inline_abstract = split_abstract_inline(item.get("text")) + + if inline_abstract: + abstract_title, abstract_text = inline_abstract + obj["type"] = "paragraph" obj["value"] = { "label": "", - "paragraph": item.get("text"), + "paragraph": abstract_title + } + stream_data.append(obj.copy()) + + obj["type"] = "paragraph_with_language" + obj["value"] = { + "label": "", + "paragraph": abstract_text, + "language": langid.classify(abstract_text)[0] or None + } + stream_data.append(obj.copy()) + + elif i + 1 < len(content): + obj["type"] = "paragraph" + obj["value"] = { + "label": "", + "paragraph": item.get("text") } stream_data.append(obj.copy()) @@ -369,14 +394,12 @@ def get_labels(article_id, user_id): stream_data_body.append(obj) continue - if item.get("text") is None or item.get("text") == "": - state["label_next"] = ( - state["label_next_reset"] if state["reset"] else state["label_next"] - ) - if state["back"]: - state["back"] = False - state["body"] = False - state["references"] = True + if item.get('text') is None or item.get('text') == '': + state['label_next'] = state['label_next_reset'] if state['reset'] else state['label_next'] + if state['back'] and num_ref > 0: + #state['back'] = False + state['body'] = False + state['references'] = True else: obj, result, state = create_labeled_object2(i, item, state, sections) @@ -403,35 +426,57 @@ def get_labels(article_id, user_id): stream_data.append(obj) else: stream_data_body.append(obj) - elif state["back"]: - if state["label"] == "": + elif state['back']: + if state['label'] == '': stream_data_back.append(obj) - if state["label"] == "

": + if state['label'] == '

': num_ref = num_ref + 1 - # obj = {}#process_reference(num_ref, obj, user_id) - obj_reference.append( - { - "num_ref": num_ref, - "obj": obj, - "text": obj["value"]["paragraph"], - } - ) - # stream_data_back.append(obj) + #obj = {}#process_reference(num_ref, obj, user_id) + obj_reference.append({"num_ref": num_ref, "obj": obj, "text": obj['value']['paragraph'],}) + #stream_data_back.append(obj) else: stream_data.append(obj) - num_refs = [item["num_ref"] for item in obj_reference] - if get_llm_model_name() == "LLAMA": for obj_ref in obj_reference: - obj = process_reference(obj_ref["num_ref"], obj_ref["obj"], user_id) - stream_data_back.append(obj) + obj = process_reference(obj_ref['num_ref'], obj_ref['obj'], user_id) + + is_reference = obj.get('is_reference', True) + + # Por si el modelo devuelve "false" como string + if isinstance(is_reference, str): + is_reference = is_reference.lower() == 'true' + + if is_reference: + # Opcional: quitar is_reference si no lo necesitas en el StreamField + obj.pop('is_reference', None) + stream_data_back.append(obj) + + else: + full_text = ( + obj.get('full_text') + or obj_ref.get('text') + or obj_ref.get('obj', {}).get('text') + or '' + ) + + obj_no_reference = { + 'type': 'paragraph', + 'value': { + 'label': '

', + 'paragraph': full_text + } + } + + obj_postreference.append(obj_no_reference) else: if llm_first_block is None: llm_first_block = LlamaService(mode="prompt", temperature=0.1) chunks = split_in_three(obj_reference) - output = [] + + output_reference = [] + num_refs_reference = [] logger.info( "get_labels: processando %d referências com Gemini (%d chunks)", len(obj_reference), @@ -440,21 +485,54 @@ def get_labels(article_id, user_id): for chunk in chunks: if len(chunk) > 0: - text_references = ( - "\n".join([item["text"] for item in chunk]) - .replace("", "") - .replace("", "") - ) + text_references = "\n".join( + [item["text"] for item in chunk] + ).replace('', '').replace('', '') + prompt_reference = create_prompt_reference(text_references) result = llm_first_block.run(prompt_reference) - match = re.search(r"\[.*\]", result, re.DOTALL) + match = re.search(r'\[.*\]', result, re.DOTALL) + if match: parsed = json.loads(match.group(0)) - output.extend(parsed) # Agrega a la lista de salida - stream_data_back.extend(process_references(num_refs, output)) + for index, item_response in enumerate(parsed): + if index >= len(chunk): + continue + + original_item = chunk[index] + + is_reference = item_response.get('is_reference', True) + + # Por si el modelo regresa "false" como texto + if isinstance(is_reference, str): + is_reference = is_reference.lower() == 'true' + + if is_reference: + num_refs_reference.append(original_item["num_ref"]) + output_reference.append(item_response) + + else: + full_text = ( + item_response.get('full_text') + or original_item.get('text') + or '' + ) + + obj_no_reference = { + 'type': 'paragraph', + 'value': { + 'label': '

', + 'paragraph': full_text + } + } + + obj_postreference.append(obj_no_reference) + + stream_data_back.extend(process_references(num_refs_reference, output_reference)) + stream_data_back.extend(obj_postreference) # data_front is never iterated inside get_xml — rescue any

items that the # state machine left in stream_data (body paragraphs misclassified as front diff --git a/markup_doc/xml.py b/markup_doc/xml.py index 64c05ab..11e78aa 100644 --- a/markup_doc/xml.py +++ b/markup_doc/xml.py @@ -688,9 +688,9 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None): node_table_text = d["value"]["content"] # Quitar saltos de línea y espacios extra - node_table_text = re.sub(r"\s*\n\s*", "", node_table_text).replace( - "
", "" - ) + node_table_text = re.sub(r"\s*\n\s*", "", node_table_text).replace("
","") + node_table_text = re.sub(r"<(?![/a-zA-Z_])", "<", node_table_text) + node_table_text = node_table_text.replace(" ", " ") node_table_text = re.sub(r"&(?!\w+;|#\d+;)", "&", node_table_text) tabla_element = parse_xml_fragment(node_table_text) @@ -861,15 +861,18 @@ def get_xml(article_docx, data_front, data, data_back, xref_map=None): node_p.append(child) for i, d in enumerate(data_back): - if d["value"]["label"] == "": - node_tit = etree.SubElement(node_reflist, "title") - append_fragment(node_tit, d["value"]["paragraph"]) - if d["value"]["label"] == "

": - values = d["value"] - refid = values.get("refid") or f"B{i + 1}" - node_ref = etree.SubElement(node_reflist, "ref", attrib={"id": refid}) - node_mix = etree.SubElement(node_ref, "mixed-citation") - append_fragment(node_mix, values["paragraph"]) + if d['value']['label'] == '': + node_tit = etree.SubElement(node_reflist, 'title') + append_fragment(node_tit, d['value']['paragraph']) + if d['value']['label'] == '

': + if 'refid' not in d['value']: + continue + values = d['value'] + node_ref = etree.SubElement(node_reflist, 'ref', attrib={"id": values['refid']}) + #node_label = etree.SubElement(node_ref, 'label') + #append_fragment(node_label, values['refid'].replace('B', '')) + node_mix = etree.SubElement(node_ref, 'mixed-citation') + append_fragment(node_mix, values['paragraph']) if values.get("reftype") == "journal": node_elem = etree.SubElement( diff --git a/markuplib/function_docx.py b/markuplib/function_docx.py index c9646cd..33a8162 100644 --- a/markuplib/function_docx.py +++ b/markuplib/function_docx.py @@ -175,6 +175,29 @@ def match_paragraph(text): return "" return False + def is_intro_heading(text): + if not text: + return False + + text = clean_labels(text) + text = re.sub(r'<\/?italic>', '', text, flags=re.I) + text = text.strip().lower() + + # Quita numeración tipo: + # 1. Introducción + # 1 Introducción + # 1.1 Introduction + text = re.sub(r'^\s*\d+(?:\.\d+)*\.?\s+', '', text) + + # Quita dos puntos o punto final + text = re.sub(r'[:.]$', '', text).strip() + + return text in [ + 'introducción', + 'introduction', + 'introdução' + ] + def matches_section(a, b): try: return ( @@ -191,22 +214,22 @@ def section_priority(sections): def identify_section(sections, size, bold, text): if size == 0: - return sections + return sections, None isupper = text.isupper() - s_id = {"size": size, "bold": bold, "isupper": isupper, "count": 0} + s_id = {"size": size, "bold": bold, "isupper": isupper, "count": 1} if len(sections) == 0: sections.append(s_id) - return sections + return sections, s_id for section in sections: if matches_section(s_id, section): - section["count"] += 1 - return sections + section['count'] += 1 + return sections, section sections.append(s_id) - return sections + return sections, s_id def clean_labels(text): # Eliminar etiquetas cuadradas tipo [ ... ] con espacios opcionales @@ -327,6 +350,7 @@ def extrae_Tabla(element, rels_map, namespaces): images = [] found_fb = False review_fb = True + intro_section = None # Palabras a buscar como indicador del primer bloque start_text = ["introducción", "introduction", "introdução"] @@ -337,13 +361,9 @@ def extrae_Tabla(element, rels_map, namespaces): for element in doc.element.body: is_numPr = False - if isinstance(element, CT_P): - obj = {} - paragraph = element - text_paragraph = [] - _ns_w = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'} - is_numPr = paragraph.find('.//w:numPr', namespaces=_ns_w) is not None + obj = {} + if isinstance(element, CT_P): namespaces = { "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", "a": "http://schemas.openxmlformats.org/drawingml/2006/main", @@ -375,16 +395,29 @@ def extrae_Tabla(element, rels_map, namespaces): if image_name not in images: images.append(image_name) - # Guardar la imagen en Wagtail wagtail_image = ImageModel.objects.create( title=image_name, file=ContentFile(image_data, name=image_name), ) - # Referenciar la imagen guardada en el objeto obj["type"] = "image" obj["image"] = wagtail_image.id + # Si el párrafo contiene imagen, no debe depender de is_numPr + if obj_image: + if len(current_list) > 0: + current_list.append("[/list]") + objl = {} + objl["type"] = "list" + objl["list"] = "\n".join(current_list) + current_list = [] + content.append(objl) + + if obj.get("type") == "image": + content.append(obj) + + continue + ns_math = { "m": "http://schemas.openxmlformats.org/officeDocument/2006/math", "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", @@ -400,45 +433,35 @@ def extrae_Tabla(element, rels_map, namespaces): mathml_root, pretty_print=True, encoding="unicode" ) - if not obj_image: - paragraph = element - text_paragraph = [] + # Si el párrafo contiene fórmula de bloque, tampoco debe depender de is_numPr + if obj_formula: + if len(current_list) > 0: + current_list.append("[/list]") + objl = {} + objl["type"] = "list" + objl["list"] = "\n".join(current_list) + current_list = [] + content.append(objl) - # Determina si es parte de una lista - is_numPr = ( - paragraph.find(".//w:numPr", namespaces=paragraph.nsmap) - is not None - ) + content.append(obj) + continue + + paragraph = element + text_paragraph = [] + + # Determina si es parte de una lista + is_numPr = paragraph.find(".//w:numPr", namespaces=paragraph.nsmap) is not None + + # obtiene id y nivel + if is_numPr: + numPr = paragraph.find(".//w:numPr", namespaces=paragraph.nsmap) + numId = numPr.find(".//w:numId", namespaces=paragraph.nsmap).get(namespaces_p + "val") + type_matches = [(key, objt) for key, objt in list_types.items() if objt["numId"] == numId] + + # Es una lista diferente + if numId != current_num_id: + current_num_id = numId - # obtiene id y nivel - if is_numPr: - numPr = paragraph.find(".//w:numPr", namespaces=paragraph.nsmap) - numId = numPr.find( - ".//w:numId", namespaces=paragraph.nsmap - ).get(namespaces_p + "val") - type = [ - (key, objt) - for key, objt in list_types.items() - if objt["numId"] == numId - ] - - # Es una lista diferente - if numId != current_num_id: - current_num_id = numId - if len(current_list) > 0: - current_list.append("[/list]") - objl = {} - objl["type"] = "list" - objl["list"] = "\n".join(current_list) - current_list = [] - content.append(objl) - list_type = "bullet" - if type[0][1][str(0)] == "decimal": - list_type = "order" - - current_list.append(f'[list list-type="{list_type}"]') - else: - # Se terminaron de agregar elementos a la lista if len(current_list) > 0: current_list.append("[/list]") objl = {} @@ -447,252 +470,278 @@ def extrae_Tabla(element, rels_map, namespaces): current_list = [] content.append(objl) - for child in paragraph: - if ( - child.tag - == "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}hyperlink" - ): - for r in child.findall("w:r", namespaces=child.nsmap): - t_elem = r.find("w:t", namespaces=child.nsmap) - if t_elem is not None and t_elem.text: - text_paragraph.append(t_elem.text) - - elif ( - child.tag - == "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r" - ): - namespaces = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}" - sz_element = child.find(".//w:sz", namespaces=child.nsmap) - obj["font_size"] = 0 - - if sz_element is None: - p_pr = paragraph.find( - ".//w:rPr/w:sz", namespaces=child.nsmap - ) - if p_pr is not None: - sz_element = p_pr.find( - ".//w:pPr", namespaces=child.nsmap - ) - - if sz_element is not None: - xml_string = etree.tostring( - sz_element, pretty_print=True, encoding="unicode" - ) - size_element = objectify.fromstring(xml_string) - font_size_value = size_element.get(namespaces + "val") - obj["font_size"] = int(font_size_value) / 2 + list_type = "bullet" - color_element = child.find( - ".//w:color", namespaces=child.nsmap - ) + if type_matches[0][1][str(0)] == "decimal": + list_type = "order" - if color_element is None: - p_pr = paragraph.find( - ".//w:pPr", namespaces=child.nsmap - ) - if p_pr is not None: - color_element = p_pr.find( - ".//w:rPr/w:color", namespaces=child.nsmap - ) - - if color_element is not None: - xml_string_color = etree.tostring( - color_element, pretty_print=True, encoding="unicode" - ) - object_element = objectify.fromstring(xml_string_color) - color_value = object_element.get(namespaces + "val") - obj["color"] = color_value + current_list.append(f'[list list-type="{list_type}"]') - b_tag = child.find(".//w:b", namespaces=child.nsmap) + else: + # Se terminaron de agregar elementos a la lista + if len(current_list) > 0: + current_list.append("[/list]") + objl = {} + objl["type"] = "list" + objl["list"] = "\n".join(current_list) + current_list = [] + content.append(objl) - if b_tag is None: - p_pr = paragraph.find( - ".//w:rPr/w:b", namespaces=child.nsmap - ) - if p_pr is not None: - b_tag = p_pr.find( - ".//w:pPr", namespaces=child.nsmap - ) - - if b_tag is not None: - val = b_tag.get( - "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val" - ) - obj["bold"] = val is None or val in [ - "1", - "true", - "True", - ] - else: - obj["bold"] = False - - i_tag = child.find(".//w:i", namespaces=child.nsmap) - - if i_tag is None: - p_pr = paragraph.find( - ".//w:rPr/w:i", namespaces=child.nsmap - ) - if p_pr is not None: - i_tag = p_pr.find( - ".//w:pPr", namespaces=child.nsmap - ) - - if i_tag is not None: - val = i_tag.get( - "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val" - ) - obj["italic"] = val is None or val in [ - "1", - "true", - "True", - ] - else: - obj["italic"] = False - - s_tag = child.find(".//w:spacing", namespaces=child.nsmap) - - if s_tag is None: - p_pr = paragraph.find( - ".//w:rPr/w:spacing", namespaces=child.nsmap - ) - if p_pr is not None: - s_tag = p_pr.find( - ".//w:pPr", namespaces=child.nsmap - ) - - if s_tag is not None: - val = s_tag.get( - "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}before" - ) - obj["spacing"] = not (val is None) - else: - obj["spacing"] = False + for child in paragraph: + if child.tag == "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}hyperlink": + for r in child.findall("w:r", namespaces=child.nsmap): + t_elem = r.find("w:t", namespaces=child.nsmap) + + if t_elem is not None and t_elem.text: + text_paragraph.append(t_elem.text) + + elif child.tag == "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}r": + namespaces = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}" + sz_element = child.find(".//w:sz", namespaces=child.nsmap) + obj["font_size"] = 12 + + if sz_element is None: + p_pr = paragraph.find(".//w:rPr/w:sz", namespaces=child.nsmap) + + if p_pr is not None: + sz_element = p_pr.find(".//w:pPr", namespaces=child.nsmap) + + if sz_element is not None: + xml_string = etree.tostring(sz_element, pretty_print=True, encoding="unicode") + size_element = objectify.fromstring(xml_string) + font_size_value = size_element.get(namespaces + "val") + obj["font_size"] = int(font_size_value) / 2 - clean_text = clean_labels(child.text) + color_element = child.find(".//w:color", namespaces=child.nsmap) + + if color_element is None: + p_pr = paragraph.find(".//w:pPr", namespaces=child.nsmap) + + if p_pr is not None: + color_element = p_pr.find(".//w:rPr/w:color", namespaces=child.nsmap) + + if color_element is not None: + xml_string_color = etree.tostring(color_element, pretty_print=True, encoding="unicode") + object_element = objectify.fromstring(xml_string_color) + color_value = object_element.get(namespaces + "val") + obj["color"] = color_value + + b_tag = child.find(".//w:b", namespaces=child.nsmap) + + if b_tag is None: + p_pr = paragraph.find(".//w:rPr/w:b", namespaces=child.nsmap) + + if p_pr is not None: + b_tag = p_pr.find(".//w:pPr", namespaces=child.nsmap) + + if b_tag is not None: + val = b_tag.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val') + obj["bold"] = (val is None or val in ["1", "true", "True"]) + else: + obj["bold"] = False + + i_tag = child.find(".//w:i", namespaces=child.nsmap) + + if i_tag is None: + p_pr = paragraph.find(".//w:rPr/w:i", namespaces=child.nsmap) + + if p_pr is not None: + i_tag = p_pr.find(".//w:pPr", namespaces=child.nsmap) + + if i_tag is not None: + val = i_tag.get("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val") + obj["italic"] = (val is None or val in ["1", "true", "True"]) + else: + obj["italic"] = False - # identifica sección - sections = identify_section( - sections, obj["font_size"], obj["bold"], clean_text + s_tag = child.find(".//w:spacing", namespaces=child.nsmap) + + if s_tag is None: + p_pr = paragraph.find(".//w:rPr/w:spacing", namespaces=child.nsmap) + + if p_pr is not None: + s_tag = p_pr.find(".//w:pPr", namespaces=child.nsmap) + + if s_tag is not None: + val = s_tag.get("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}before") + obj["spacing"] = not (val is None) + else: + obj["spacing"] = False + + clean_text = clean_labels(child.text or "") + + # Identifica sección + sections, current_section = identify_section( + sections, + obj["font_size"], + obj["bold"], + clean_text + ) + + if intro_section is None and current_section is not None and is_intro_heading(clean_text): + intro_section = current_section.copy() + + if obj["italic"]: + text_paragraph.append( + "" + clean_text + "" + (f" {hiperlinks}" if hiperlinks else "") + ) + else: + text_paragraph.append( + clean_text + (f" {hiperlinks}" if hiperlinks else "") ) - if obj["italic"]: - text_paragraph.append( - "" - + clean_text - + "" - + (f" {hiperlinks}" if hiperlinks else "") - ) - else: - text_paragraph.append( - clean_text - + (f" {hiperlinks}" if hiperlinks else "") - ) + paraph = match_paragraph(clean_text) - paraph = match_paragraph(clean_text) - if paraph: - obj["paraph"] = paraph - obj["type"] = paraph + if paraph: + obj["paraph"] = paraph + obj["type"] = paraph - if review_fb: - found_fb = any( - word in clean_text.lower() for word in start_text - ) + if review_fb: + found_fb = any(word in clean_text.lower() for word in start_text) - # Si se encontró alguna palabra, incluye todo lo anterior en un sólo bloque - if found_fb: - found_fb = False - review_fb = False - found_hiperlinks = False - sections = [sections[-1]] - first_block = "" - tmp_content = [] - abstract_mode = False - - for c in content: - if abstract_mode: - if c["text"] == "" or c["spacing"] is True: - abstract_mode = False + # Si se encontró alguna palabra, incluye todo lo anterior en un solo bloque + if found_fb: + found_fb = False + review_fb = False + found_hiperlinks = False + + first_block = "" + tmp_content = [] + abstract_mode = False + abstract_started = False + + for c in content: + if abstract_mode: + if not abstract_started: + if c.get("text") == "" or c.get("spacing") is True: + continue else: + abstract_started = True tmp_content.append(c) continue - if "paraph" in c: - tmp_content.append(c) + # Ya empezó el abstract: aquí sí un vacío marca fin + if c.get("text") == "" or c.get("spacing") is True: abstract_mode = False - if c["paraph"] == "": - abstract_mode = True - continue + abstract_started = False + continue else: - if "text" in c: - first_block = first_block + "\n" + c["text"] - if "table" in c: - first_block = ( - first_block + "\n" + c["table"] - ) - - obj_b = {} - obj_b["type"] = "first_block" - obj_b["text"] = first_block - tmp_content.append(obj_b) - content = tmp_content - start_text = [] - - if child.tag == f"{{{ns_math['m']}}}oMath": - if "text" not in obj or not isinstance(obj["text"], list): - obj["type"] = "compound" - obj["text"] = [] - if len(text_paragraph) > 0: - obj2 = {} - obj2["type"] = "text" - obj2["value"] = " ".join(text_paragraph) - obj["text"].append(obj2) - text_paragraph = [] - - mathml_result = transform(child) - mathml_root = etree.fromstring(str(mathml_result)) - self.replace_mfenced_pipe_only(mathml_root) + tmp_content.append(c) + continue + + if "paraph" in c: + tmp_content.append(c) + abstract_mode = False + abstract_started = False + + if c["paraph"] == "": + abstract_mode = True + abstract_started = False + continue + + else: + if "text" in c: + first_block = first_block + "\n" + c["text"] + + if "table" in c: + first_block = first_block + "\n" + c["table"] + + obj_b = {} + obj_b["type"] = "first_block" + obj_b["text"] = first_block + tmp_content.append(obj_b) + content = tmp_content + start_text = [] + + if child.tag == f"{{{ns_math['m']}}}oMath": + if "text" not in obj or not isinstance(obj["text"], list): + obj["type"] = "compound" + obj["text"] = [] + + if len(text_paragraph) > 0: obj2 = {} - obj2["type"] = "formula" - obj2["value"] = etree.tostring( - mathml_root, pretty_print=True, encoding="unicode" - ) + obj2["type"] = "text" + obj2["value"] = " ".join(text_paragraph) obj["text"].append(obj2) + text_paragraph = [] - if "text" not in obj: - obj["text"] = (" ".join(text_paragraph)).strip() - clean_text = clean_labels(obj["text"]) - obj["text"] = clean_text + mathml_result = transform(child) + mathml_root = etree.fromstring(str(mathml_result)) + self.replace_mfenced_pipe_only(mathml_root) - paraph = match_paragraph(obj["text"]) - if paraph: - obj["paraph"] = paraph - obj["type"] = paraph - - if is_numPr: - if "font_size" in obj: - del obj["font_size"] - current_list.append(f'[list-item]{obj["text"]}[/list-item]') - if isinstance(obj["text"], list) and len(text_paragraph) > 0: obj2 = {} - obj2["type"] = "text" - obj2["value"] = " ".join(text_paragraph) + obj2["type"] = "formula" + obj2["value"] = etree.tostring(mathml_root, pretty_print=True, encoding="unicode") obj["text"].append(obj2) - text_paragraph = [] + + if "text" not in obj: + obj["text"] = (" ".join(text_paragraph)).strip() + clean_text = clean_labels(obj["text"]) + obj["text"] = clean_text + + paraph = match_paragraph(obj["text"]) + + if paraph: + obj["paraph"] = paraph + obj["type"] = paraph + + if is_numPr: + if "font_size" in obj: + del obj["font_size"] + + current_list.append(f'[list-item]{obj["text"]}[/list-item]') + + if isinstance(obj.get("text"), list) and len(text_paragraph) > 0: + obj2 = {} + obj2["type"] = "text" + obj2["value"] = " ".join(text_paragraph) + obj["text"].append(obj2) + text_paragraph = [] + + # Solo los párrafos que NO son lista se agregan directamente + if not is_numPr: + content.append(obj) elif isinstance(element, CT_Tbl): + # Si una tabla viene después de una lista, primero se cierra la lista + if len(current_list) > 0: + current_list.append("[/list]") + objl = {} + objl["type"] = "list" + objl["list"] = "\n".join(current_list) + current_list = [] + content.append(objl) + namespaces = { "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", "a": "http://schemas.openxmlformats.org/drawingml/2006/main", "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships", } - table = element table_data = extrae_Tabla(element, hiperlinks_info, namespaces) + obj = {} obj["type"] = "table" obj["table"] = table_data - if not is_numPr: + # Las tablas no dependen de is_numPr content.append(obj) + + # Si el documento termina con una lista, se cierra aquí + if len(current_list) > 0: + current_list.append("[/list]") + objl = {} + objl["type"] = "list" + objl["list"] = "\n".join(current_list) + current_list = [] + content.append(objl) + sections.sort(key=section_priority) - return sections, content + + if intro_section is not None: + for index, section in enumerate(sections): + if matches_section(section, intro_section): + sections = sections[index:] + break + + return sections, content \ No newline at end of file diff --git a/model_ai/messages.py b/model_ai/messages.py index b21c06d..7863780 100644 --- a/model_ai/messages.py +++ b/model_ai/messages.py @@ -340,7 +340,7 @@ 'state': {'type': 'string'}, 'code_country': {'type': 'string'}, 'name_country': {'type': 'string'}, - 'text_aff': {'type': 'text'} + 'text_aff': {'type': 'string'} }, "required": [ "aff", "char", "orgname", "orgdiv1", "orgdiv2", @@ -356,7 +356,10 @@ REFERENCE_MESSAGES = [ { 'role': 'system', - 'content': 'You are an assistant who distinguishes the metadata of a bibliographic reference and returns it in JSON format.' + 'content': """You are an assistant who distinguishes the metadata of a bibliographic reference and returns it in JSON format. + First determine whether the paragraph is a bibliographic reference. + If the paragraph is not a bibliographic reference, do not analyze its metadata. Return only full_text and is_reference with value false. + """ }, { 'role': 'user', 'content': """ @@ -365,6 +368,9 @@ }, { 'role': 'assistant', 'content': json.dumps({ + 'full_text': 'Smith, J. (2020). Understanding AI. Journal of Technology, 15(3), 45-60. https://doi.org/10.1234/jtech.2020.015', + 'is_reference': 'true', + 'reftype': 'journal', 'authors': [ { "name": "J.", @@ -383,6 +389,17 @@ 'doi': "10.1234/jtech.2020.015" }) }, + { 'role': 'user', + 'content': """ + Figures Captions + """ + }, + { 'role': 'assistant', + 'content': json.dumps({ + 'full_text': 'Caption Figures', + 'is_reference': 'false' + }) + } ] REFERENCE_RESPONSE_FORMAT = { @@ -390,6 +407,9 @@ 'schema':{ 'type': 'object', 'properties': { + 'full_text': {'type': 'string'}, + 'is_reference': {'type': 'boolean'}, + 'reftype': {'type': 'string', 'enum': ['journal', 'thesis', 'book', 'data', 'webpage', 'software', 'confproc']}, 'authors': {'type': 'array', 'items': { 'type': 'object', diff --git a/reference/api/v1/views.py b/reference/api/v1/views.py index 20334a6..82511df 100755 --- a/reference/api/v1/views.py +++ b/reference/api/v1/views.py @@ -40,7 +40,7 @@ def api_reference(self, request): except Reference.DoesNotExist: new_reference = Reference.objects.create( mixed_citation=post_reference, - status=ReferenceStatus.CREATING, + estatus=ReferenceStatus.CREATING, creator=self.request.user, ) diff --git a/reference/config.py b/reference/config.py index 7c385e0..a6e726d 100644 --- a/reference/config.py +++ b/reference/config.py @@ -9,6 +9,7 @@ }, { 'role': 'assistant', 'content': json.dumps({ + 'is_reference': 'true', 'reftype': 'journal', 'authors': [ { 'surname': 'Bachman', 'fname': 'S.' }, @@ -33,6 +34,7 @@ }, { 'role': 'assistant', 'content': json.dumps({ + 'is_reference': 'true', 'reftype': 'Thesis', 'authors': [ { 'surname': 'Brunel', 'fname': 'J. F.' }, @@ -52,7 +54,8 @@ 'content': 'Hernández-López, L. 1995. The endemic flora of Jalisco, Mexico: Centers of endemism and implications for conservation. Tesis de maestría. Universidad de Wisconsin. Madison, USA. 74 pp.' }, { 'role': 'assistant', - 'content': json.dumps({ + 'content': json.dumps({ + 'is_reference': 'true', 'reftype': 'Thesis', 'authors': [ { 'surname': 'Hernández-López', 'fname': 'L.' }, @@ -73,6 +76,7 @@ { 'role': 'assistant', 'content': json.dumps({ + 'is_reference': 'true', 'reftype': 'book', 'authors':[ { 'surname': 'Schimper', 'fname': 'A. F. W.' }, @@ -92,6 +96,7 @@ { 'role': 'assistant', 'content': json.dumps({ + 'is_reference': 'true', 'reftype': 'book', 'authors':[ { 'surname': 'Correa', 'fname': 'M. D.' }, @@ -113,6 +118,7 @@ { 'role': 'assistant', 'content': json.dumps({ + 'is_reference': 'true', 'reftype': 'data', 'authors':[ { 'surname': 'Hernández-López', 'fname': 'L.' }, @@ -152,6 +158,7 @@ { 'role': 'assistant', 'content': json.dumps({ + 'is_reference': 'true', 'reftype': 'software', 'authors':[ { 'collab': 'Nikon Corporation' }, @@ -171,6 +178,7 @@ { 'role': 'assistant', 'content': json.dumps({ + 'is_reference': 'true', 'reftype': 'confproc', 'full_text': 'Furton EJ, Dort V, editors. Addiction and compulsive behaviors. Proceedings of the 17th Workshop for Bishops; 1999; Dallas, TX. Boston: National Catholic Bioethics Center (US); 2000. 258 p.', 'authors':[ @@ -187,6 +195,28 @@ 'pages': '258 p' }) }, + { + 'role': 'user', + 'content': 'Caption Figures' + }, + { + 'role': 'assistant', + 'content': json.dumps({ + 'full_text': 'Caption Figures', + 'is_reference': 'false' + }), + }, + { + 'role': 'user', + 'content': 'Author\'s Address.' + }, + { + 'role': 'assistant', + 'content': json.dumps({ + 'full_text': 'Author\'s Address.', + 'is_reference': 'false' + }), + } ] RESPONSE_FORMAT = { @@ -194,6 +224,7 @@ 'schema':{ 'type': 'object', 'properties': { + 'is_reference': {'type': 'boolean'}, 'reftype': {'type': 'string', 'enum': ['journal', 'thesis', 'book', 'data', 'webpage', 'software', 'confproc']}, 'authors': {'type': 'array', 'items': { @@ -205,7 +236,7 @@ } } }, - "full_text": {"type": "integer"}, + "full_text": {"type": "string"}, "date": {"type": "integer"}, "title": {"type": "string"}, "chapter": {"type": "string"}, diff --git a/reference/config_gemini.py b/reference/config_gemini.py index d7fb18f..7541bf4 100644 --- a/reference/config_gemini.py +++ b/reference/config_gemini.py @@ -4,6 +4,9 @@ def create_prompt_reference(references): You are an assistant who distinguishes all the components of all citations in an article with output in JSON Rules: + - First determine whether the paragraph is a bibliographic reference. + - If the paragraph is not a bibliographic reference, do not analyze its metadata. Return only full_text and is_reference with value false. + - If the paragraph is a bibliographic reference, set is_reference to true and extract the metadata. - If a DOI is present in the citation, it must be included in the doi field, and the uri field must be None. If there is no DOI, then a valid persistent URL (e.g., from a repository or publisher) must be provided in the uri field instead. One of these fields — doi or uri — must always be populated. Never leave both empty. - For references of type journal, the field pages must not be included, even if they appear in the original citation. Instead, the page range should be provided only in the fields fpage and lpage. - Consider that in book-type references, the source field generally refers to the title of the book, so do not use the title field in this case, only source. @@ -16,6 +19,7 @@ def create_prompt_reference(references): "type": "object", "properties": {{ "full_text": {{"type": "string"}}, + "is_reference": {{"type": "boolean"}}, "reftype": {{"type": "string", "enum": ["journal", "thesis", "book", "data", "webpage", "software", "confproc"]}}, "authors": {{"type": "array", "items": {{ @@ -81,11 +85,16 @@ def create_prompt_reference(references): Furton EJ, Dort V, editors. Addiction and compulsive behaviors. Proceedings of the 17th Workshop for Bishops; 1999; Dallas, TX. Boston: National Catholic Bioethics Center (US); 2000. 258 p. + Figures Captions + + Author's Address. + Response: [ {{ "full_text": "Bachman, S., J. Moat, A. W. Hill, J. de la Torre and B. Scott. 2011. Supporting Red List threat assessments with GeoCAT: geospatial conservation assessment tool. ZooKeys 150: 117-126. DOI: https://doi.org/10.3897/zookeys.150.2109", + "is_reference": true, "reftype": "journal", "authors": [ {{ "surname": "Bachman", "fname": "S." }}, @@ -105,6 +114,7 @@ def create_prompt_reference(references): }}, {{ "full_text": "Brunel, J. F. 1987. Sur le genre Phyllanthus L. et quelques genres voisins de la Tribu des Phyllantheae Dumort. (Euphorbiaceae, Phyllantheae) en Afrique intertropicale et à Madagascar. Thèse de doctorat de l’Université L. Pasteur. Strasbourg, France. 760 pp.", + "is_reference": true, "reftype": "Thesis", "authors": [ {{ "surname": "Brunel", "fname": "J. F." }}, @@ -118,6 +128,7 @@ def create_prompt_reference(references): }}, {{ "full_text": "Hernández-López, L. 1995. The endemic flora of Jalisco, Mexico: Centers of endemism and implications for conservation. Tesis de maestría. Universidad de Wisconsin. Madison, USA. 74 pp.", + "is_reference": true, "reftype": "Thesis", "authors": [ {{ "surname": "Hernández-López", "fname": "L." }}, @@ -131,6 +142,7 @@ def create_prompt_reference(references): }}, {{ "full_text": "Jones DL. The role of physical activity on the need for revision total knee arthroplasty in individuals with osteoarthritis of the knee [dissertation]. [Pittsburgh (PA)]: University of Pittsburgh; 2001. 436 p.", + "is_reference": true, "reftype": "Thesis", "authors": [ {{ "surname": "Jones", "fname": "DL" }}, @@ -143,6 +155,7 @@ def create_prompt_reference(references): }}, {{ "full_text": "Schimper, A. F. W. 1903. Plant geography upon a physiological basis. Clarendon Press. Oxford, UK. 839 pp.", + "is_reference": true, "reftype": "book", "authors":[ {{ "surname": "Schimper", "fname": "A. F. W." }}, @@ -169,6 +182,7 @@ def create_prompt_reference(references): }}, {{ "full_text": "Hernández-López, L. 2019. Las especies endémicas de plantas en el estado de Jalisco: su distribución y conservación. Comisión Nacional para el Conocimiento y Uso de la Biodiversidad (CONABIO). Cd. Mx., México. https://doi.org/10.15468/ktvqds (consultado diciembre de 2019).", + "is_reference": true, "reftype": "data", "authors":[ {{ "surname": "Hernández-López", "fname": "L." }}, @@ -182,6 +196,7 @@ def create_prompt_reference(references): }}, {{ "full_text": "Lucas Leão; Perobelli, Fernando Salgueiro; Ribeiro, Hilton Manoel Dias, 2024, Data for: Ação Coletiva Institucional e Consórcio Públicos Intermunicipais no Brasil, DOI: 10.48331/scielodata.5Z4TMP, SciELO Data, V1, UNF:6:Neyjad4du3rFprhupCXizA== [fileUNF]. Disponível em: https://doi.org/10.48331/scielodata", + "is_reference": true, "reftype": "data", "authors":[ {{ "surname": "Leão", "fname": "Lucas" }}, @@ -198,6 +213,7 @@ def create_prompt_reference(references): }}, {{ "full_text": "INAFED. 2010. Enciclopedia de los Municipios y Delegaciones de México: Jalisco. Instituto Nacional para el Federalismo y el Desarrollo Municipal. http://www.inafed.gob.mx/ work/enciclopedia/EMM21puebla/index.html (consultado diciembre de 2018).", + "is_reference": true, "reftype": "webpage", "authors":[ {{ "collab": "INAFED" }}, @@ -210,6 +226,7 @@ def create_prompt_reference(references): }}, {{ "full_text": "COB - Comitê Olímpico Brasileiro. Desafio para o corpo. Disponível em: http://www.cob.org.br/esportes/esporte.asp?id=39. (Acesso em 10 abr 2010)", + "is_reference": true, "reftype": "webpage", "authors":[ {{ "collab": "COB -Comitê Olímpico Brasileiro" }}, @@ -221,6 +238,7 @@ def create_prompt_reference(references): }}, {{ "full_text": "Nikon Corporation. 1991-2006. NIS- Elements, version 2.33. Tokio, Japón.", + "is_reference": true, "reftype": "software", "authors":[ {{ "collab": "Nikon Corporation" }}, @@ -233,6 +251,7 @@ def create_prompt_reference(references): }}, {{ "full_text": "Hamric, Ann B.; Spross, Judith A.; Hanson, Charlene M. Advanced practice nursing: an integrative approach. 3rd ed. St. Louis (MO): Elsevier Saunders; c2005. 979 p.", + "is_reference": true, "reftype": "book", "authors":[ {{ "surname": "Hamric", "fname": "Ann B." }}, @@ -248,6 +267,7 @@ def create_prompt_reference(references): }}, {{ "full_text": "Calkins BM, Mendeloff AI. The epidemiology of idiopathic inflammatory bowel disease. In: Kirsner JB, Shorter RG, eds. Inflammatory bowel disease, 4th ed. Baltimore: Williams & Wilkins. 1995:31-68.", + "is_reference": true, "reftype": "book", "authors":[ {{ "surname": "Calkins", "fname": "BM" }}, @@ -268,6 +288,7 @@ def create_prompt_reference(references): }}, {{ "full_text": "Furton EJ, Dort V, editors. Addiction and compulsive behaviors. Proceedings of the 17th Workshop for Bishops; 1999; Dallas, TX. Boston: National Catholic Bioethics Center (US); 2000. 258 p.", + "is_reference": true, "reftype": "confproc", "authors":[ {{ "surname": "Furton", "fname": "EJ" }}, @@ -281,6 +302,14 @@ def create_prompt_reference(references): "organization": "National Catholic Bioethics Center (US)", "org_location": "Boston", "pages": "258 p" + }}, + {{ + "full_text": "Caption Figures", + "is_reference": false + }}, + {{ + "full_text": "Author's Address.", + "is_reference": false }} ]