class FaserDraftsModel(CdsOverdo):
    """Translation model for FASER draft records.

    The query targets records tagged ``693__:FASER`` whose ``591__b`` is
    "Draft" and whose ``980__.a`` is NOTE, one of the conference types
    (CONFERENCEPAPER, SLIDE, CONFERENCENOTE), or ARTICLE/PREPRINT.
    """

    # Search expression describing the legacy records this model handles.
    # NOTE(review): a leading "-" presumably negates a term in the legacy CDS
    # search syntax -- confirm.
    __query__ = (
        '693__:FASER AND 591__b:"Draft" AND ('
        "980__.a:NOTE OR "
        "(980__.a:CONFERENCEPAPER OR 980__.a:SLIDE OR 980__.a:CONFERENCENOTE) OR "
        "((980__.a:ARTICLE OR 980__.a:PREPRINT) -980__.a:CONFERENCEPAPER)"
        ")"
    )

    # MARC field/subfield keys that have no translation rule on purpose.
    # NOTE(review): presumably these are excluded from the transform's
    # "Unassigned metadata key" check -- confirm against CdsOverdo.
    __ignore_keys__ = {
        "0248_a",
        "0248_p",
        "0248_q",
        "035__d",  # oai harvest tag
        "035__h",  # oai harvest tag
        "035__m",  # oai harvest tag
        "035__t",  # oai harvest tag
        "100__m",  # email of contributor
        "300__a",  # number of pages
        "500__9",
        "520__9",
        "540__3",
        "542__3",
        "700__m",  # email of contributor
        "773__p",  # TODO: title of the related links: 2959848, 2949836. values: twiki, internal comments
        "8564_8",
        "8564_s",
        "8564_x",
        "8564_y",  # file description - done by files dump
        "916__y",  # year of publication, redundant
        "937__c",  # last modified by
        "937__s",  # last modification date
        "960__a",  # base number
        "961__c",  # CDS modification tag # TODO
        "961__h",  # CDS modification tag # TODO
        "961__l",  # CDS modification tag # TODO
        "961__x",  # CDS modification tag # TODO
        "981__a",  # duplicate record id
        "980__b",  # additional article tag
    }

    # Values declared as defaults for every record of this model.
    _default_fields = {
        "custom_fields": {},
        # TODO: some records are missing creators, is it okay if we add a default value?
        "creators": [
            {"person_or_org": {"type": "organizational", "name": "FASER Collaboration"}}
        ],
    }


# Model instance: extends the base publication model and names the
# entry-point group for the FASER drafts rules.
faser_drafts_model = FaserDraftsModel(
    bases=(rdm_base_publication_model,),
    entry_point_group="cds_migrator_kit.migrator.rules.faser_drafts",
)
class FaserPublicationModel(CdsOverdo):
    """Translation model for FASER publication records.

    The query targets records tagged ``693__:FASER`` with the ``591__b``
    "Draft" term excluded, restricted to the same ``980__.a`` types as the
    drafts model (NOTE, conference types, ARTICLE/PREPRINT).
    """

    # Search expression describing the legacy records this model handles.
    # NOTE(review): a leading "-" presumably negates a term in the legacy CDS
    # search syntax -- confirm.
    __query__ = (
        '693__:FASER -591__b:"Draft" AND ('
        "980__.a:NOTE OR "
        "(980__.a:CONFERENCEPAPER OR 980__.a:SLIDE OR 980__.a:CONFERENCENOTE) OR "
        "((980__.a:ARTICLE OR 980__.a:PREPRINT) -980__.a:CONFERENCEPAPER)"
        ")"
    )

    # MARC field/subfield keys that have no translation rule on purpose.
    # NOTE(review): presumably these are excluded from the transform's
    # "Unassigned metadata key" check -- confirm against CdsOverdo.
    __ignore_keys__ = {
        "0248_a",
        "0248_p",
        "0248_q",
        "035__d",  # oai harvest tag
        "035__h",  # oai harvest tag
        "035__m",  # oai harvest tag
        "035__t",  # oai harvest tag
        "035__u",  # oai harvest tag
        "037__c",  # arXiv subject
        "100__m",  # email of contributor
        "100__v",  # explanation of the affiliation? TODO: can we ignore? 2816452, 2924565
        "270__m",  # contact person email
        "300__a",  # number of pages
        "500__9",
        "520__9",
        "540__3",
        "542__3",
        "700__m",  # email of contributor
        "700__v",  # explanation of the affiliation? TODO: can we ignore? 2816452, 2924565
        "8564_8",
        "8564_s",
        "8564_x",
        "8564_y",  # file description - done by files dump
        "8564_z",  # TODO value: Stamped by WebSubmit. Only EP records: 2888582, 2917427, 2853245, 2766207
        "903__s",  # public: 2651328, 2642351, 2702868
        "905__m",  # email of spokesperson
        "916__y",  # year of publication, redundant
        "937__c",  # last modified by
        "937__s",  # last modification date
        "960__a",  # base number
        "961__c",  # CDS modification tag # TODO
        "961__h",  # CDS modification tag # TODO
        "961__l",  # CDS modification tag # TODO
        "961__x",  # CDS modification tag # TODO
        "981__a",  # duplicate record id
        # TODO: the keys below are ignored for now and still need a decision.
        "773__0",  # TODO: check what this field is
        "773__z",  # TODO: check what this field is
        "773__p",  # TODO: title of the related links: 2959848, 2949836. values: twiki, internal comments
        "773__x",  # TODO: one record 2652275
        "542__h",  # TODO: year is in the wrong field 2816452
        "0247_9",  # TODO: 20 records have both 2 and 9 subfields.
        "906__a",  # TODO: one record 2702868
        "906__p",
        "906__m",
        "595_Ds",  # TODO: some note? 2798675, 2791032
        "595_Dd",
        "595_Da",
    }

    # Values declared as defaults for every record of this model.
    _default_fields = {
        "custom_fields": {},
    }


# Model instance: extends the base publication model and names the
# entry-point group for the FASER publication rules.
faser_publication_model = FaserPublicationModel(
    bases=(rdm_base_publication_model,),
    entry_point_group="cds_migrator_kit.migrator.rules.faser_publication",
)
@model.over("collection", "^690C_", override=True)
@for_each_value
def collection(self, key, value):
    """Record a 690C_ collection tag as a ``collection:<name>`` subject.

    Subfield ``a`` is stripped and compared case-insensitively against the
    tags known for FASER drafts; the stripped value, with its original
    casing, is appended to ``self["subjects"]``.

    Raises:
        UnexpectedValue: subfield ``a`` is missing, empty, or not one of the
            known collection tags.
        IgnoreKey: always raised after the subject has been appended; this
            rule never returns a value.
    """
    # A missing subfield "a" must surface as UnexpectedValue, not as an
    # AttributeError from calling .strip() on None.
    collection_name = (value.get("a") or "").strip()
    if collection_name.lower() not in ("cern", "faser", "preprint"):
        raise UnexpectedValue(subfield="a", field=key, value=value)

    subjects = self.get("subjects", [])
    subjects.append({"subject": f"collection:{collection_name}"})
    self["subjects"] = subjects
    raise IgnoreKey("collection")
@model.over("related_ids", "(^773__)", override=True)
@for_each_value
def related_ids(self, key, value):
    """Add the link carried by a 773__ field to ``related_identifiers``.

    The field is handed to the base ``urls`` rule; the first entry it
    returns is appended to ``self["related_identifiers"]`` unless an equal
    entry is already there.

    Raises:
        UnexpectedValue: subfield ``u`` is missing or empty.
        IgnoreKey: in every other case; this rule never returns a value.
    """
    if not value.get("u", ""):
        raise UnexpectedValue(subfield="u", field=key, value=value)

    # TODO: how to transform? https://cds.cern.ch/record/1452204/export/xm
    # Transform like the base `urls` rule, we lose the title "p" it's okay?
    converted = urls(self, key, value)
    if converted:
        first_entry = converted[0]
        known_entries = self.get("related_identifiers", [])
        if first_entry not in known_entries:
            known_entries.append(first_entry)
        self["related_identifiers"] = known_entries

    raise IgnoreKey("related_ids")
@model.over("collection", "^690C_", override=True)
@for_each_value
def collection(self, key, value):
    """Record a 690C_ collection tag as a ``collection:<name>`` subject.

    Subfield ``a`` is stripped and compared case-insensitively against the
    tags known for FASER publications; the stripped value, with its original
    casing, is appended to ``self["subjects"]``.

    Raises:
        UnexpectedValue: subfield ``a`` is missing, empty, or not one of the
            known collection tags.
        IgnoreKey: always raised after the subject has been appended; this
            rule never returns a value.
    """
    # A missing subfield "a" must surface as UnexpectedValue, not as an
    # AttributeError from calling .strip() on None.
    collection_name = (value.get("a") or "").strip()
    known_tags = ("cern", "article", "preprint", "scicom", "publlhcc")
    if collection_name.lower() not in known_tags:
        raise UnexpectedValue(subfield="a", field=key, value=value)

    subjects = self.get("subjects", [])
    subjects.append({"subject": f"collection:{collection_name}"})
    self["subjects"] = subjects
    raise IgnoreKey("collection")
@model.over("restriction_access_notes", "^5061_")
@for_each_value
def restriction_access_notes(self, key, value):
    """Validate a 5061_ restriction access note, then drop the field.

    Subfield ``a`` is stripped and lower-cased. An empty or missing note is
    accepted; a non-empty note must be one of the known FASER values.
    Nothing is written to the record.

    Raises:
        UnexpectedValue: the note is non-empty and not a known value.
        IgnoreKey: in every other case; this rule never returns a value.
    """
    note = value.get("a", "").strip().lower()
    # TODO: how to handle these values?
    if note and note not in ("faser-confnote", "faser-confpaper"):
        raise UnexpectedValue(subfield="a", field=key, value=value)
    raise IgnoreKey("restriction_access_notes")
@model.over("ep_approval", "^9031_")
@for_each_value
def ep_approval(self, key, value):
    """Translate one 9031_ field into an EP approval entry.

    Subfields are read as: ``s`` status, ``f`` submitted_by, ``d`` date,
    ``e`` deadline, ``a`` description, ``b`` ep_report_number,
    ``g`` stamp_info, ``c`` doc_type. All values are stripped; status and
    submitted_by are also lower-cased.

    Returns:
        dict: the entries above, keeping only those with a non-empty value.

    Raises:
        UnexpectedValue: the status is neither "waiting" nor "approved".
    """
    status = value.get("s", "").strip().lower()
    if status not in ("waiting", "approved"):
        # The rejected value comes from subfield "s", so report that
        # subfield rather than "a" (the description).
        raise UnexpectedValue(subfield="s", field=key, value=value)

    entry = {
        "status": status,
        "submitted_by": value.get("f", "").strip().lower(),
        "date": value.get("d", "").strip(),
        "deadline": value.get("e", "").strip(),
        "description": value.get("a", "").strip(),
        "ep_report_number": value.get("b", "").strip(),
        "stamp_info": value.get("g", "").strip(),
        "doc_type": value.get("c", "").strip(),
    }
    # Drop empty entries so absent subfields do not appear in the output.
    return {name: text for name, text in entry.items() if text}
cds_migrator_kit/rdm/tmp/faser-drafts + log_dir: cds_migrator_kit/rdm/log/faser-drafts + restricted: "True" + access_grants_view: + - faser-all + extract: + dirpath: cds_migrator_kit/rdm/data/faser-drafts/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/faser-drafts/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - "33af9368-5bad-45cb-9360-8c9e5dfca09f" + faser: + data_dir: cds_migrator_kit/rdm/data/faser + tmp_dir: cds_migrator_kit/rdm/tmp/faser + log_dir: cds_migrator_kit/rdm/log/faser + extract: + dirpath: cds_migrator_kit/rdm/data/faser/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/faser/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - "33af9368-5bad-45cb-9360-8c9e5dfca09f" + faser-ep: + data_dir: cds_migrator_kit/rdm/data/faser + tmp_dir: cds_migrator_kit/rdm/tmp/faser + log_dir: cds_migrator_kit/rdm/log/faser + extract: + dirpath: cds_migrator_kit/rdm/data/faser-ep/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/faser/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - "33af9368-5bad-45cb-9360-8c9e5dfca09f" diff --git a/cds_migrator_kit/transform/xml_processing/rules/base.py b/cds_migrator_kit/transform/xml_processing/rules/base.py index f4de40e3..e956460c 100644 --- a/cds_migrator_kit/transform/xml_processing/rules/base.py +++ b/cds_migrator_kit/transform/xml_processing/rules/base.py @@ -111,7 +111,11 @@ def process_contributors(key, value, orcid_subfield="k"): for aff in _affiliations: if aff: aff_entry = aff.replace("ROR:", "") - affiliations.append(normalize_ror(aff_entry)) + try: + normalized_ror = normalize_ror(aff_entry) + affiliations.append(normalized_ror) + except Exception: + raise UnexpectedValue(field=key, subfield="t", value=aff_entry) else: affiliations = get_contributor_affiliations(value) diff --git a/setup.cfg b/setup.cfg index 07732861..fd0c6d54 100644 --- a/setup.cfg +++ b/setup.cfg @@ -86,6 +86,8 @@ 
cds_migrator_kit.migrator.models = en = cds_migrator_kit.rdm.records.transform.models.en:en_model annual_rep = cds_migrator_kit.rdm.records.transform.models.annual_report:annual_rep_model + faser_drafts = cds_migrator_kit.rdm.records.transform.models.faser_drafts:faser_drafts_model + faser_publication = cds_migrator_kit.rdm.records.transform.models.faser_publication:faser_publication_model cds_migrator_kit.migrator.rules.base = base = cds_migrator_kit.transform.xml_processing.rules.base cds_migrator_kit.migrator.rdm.rules.base = @@ -167,6 +169,16 @@ cds_migrator_kit.migrator.rules.fap = base = cds_migrator_kit.transform.xml_processing.rules.base base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base fap = cds_migrator_kit.rdm.records.transform.xml_processing.rules.fap +cds_migrator_kit.migrator.rules.faser_drafts = + base = cds_migrator_kit.transform.xml_processing.rules.base + base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base + publication = cds_migrator_kit.rdm.records.transform.xml_processing.rules.publications + faser_drafts = cds_migrator_kit.rdm.records.transform.xml_processing.rules.faser_drafts +cds_migrator_kit.migrator.rules.faser_publication = + base = cds_migrator_kit.transform.xml_processing.rules.base + base_records = cds_migrator_kit.rdm.records.transform.xml_processing.rules.base + publication = cds_migrator_kit.rdm.records.transform.xml_processing.rules.publications + faser_publication = cds_migrator_kit.rdm.records.transform.xml_processing.rules.faser_publication cds_migrator_kit.migrator.rules.people = people = cds_migrator_kit.rdm.users.transform.xml_processing.rules.people invenio_pidstore.minters =