Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions cds_migrator_kit/rdm/records/transform/models/faser_drafts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2026 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM FASER drafts model."""

from cds_migrator_kit.rdm.records.transform.models.base_publication_record import (
rdm_base_publication_model,
)
from cds_migrator_kit.transform.overdo import CdsOverdo


class FaserDraftsModel(CdsOverdo):
    """Translation model for FASER records flagged as drafts (591__b)."""

    # Selects FASER records marked "Draft" whose 980__a collection is a note,
    # a conference paper/slide/note, or an article/preprint that is not also
    # tagged as a conference paper.
    __query__ = (
        '693__:FASER AND 591__b:"Draft" '
        "AND (980__.a:NOTE "
        "OR (980__.a:CONFERENCEPAPER OR 980__.a:SLIDE OR 980__.a:CONFERENCENOTE) "
        "OR ((980__.a:ARTICLE OR 980__.a:PREPRINT) -980__.a:CONFERENCEPAPER))"
    )

    # MARC keys listed here are skipped on purpose, grouped by tag.
    __ignore_keys__ = {
        # 0248_ subfields
        "0248_a",
        "0248_p",
        "0248_q",
        # 035__: OAI harvest tags
        "035__d",
        "035__h",
        "035__m",
        "035__t",
        # contributor e-mail addresses
        "100__m",
        "700__m",
        # number of pages
        "300__a",
        # 5xx subfields
        "500__9",
        "520__9",
        "540__3",
        "542__3",
        # TODO: title of the related links: 2959848, 2949836.
        # values: twiki, internal comments
        "773__p",
        # 8564_: file subfields
        "8564_8",
        "8564_s",
        "8564_x",
        "8564_y",  # file description - done by files dump
        # redundant year of publication
        "916__y",
        # last modified by / last modification date
        "937__c",
        "937__s",
        # base number
        "960__a",
        # TODO: CDS modification tags
        "961__c",
        "961__h",
        "961__l",
        "961__x",
        # additional article tag
        "980__b",
        # duplicate record id
        "981__a",
    }

    _default_fields = {
        "custom_fields": {},
        # TODO: some records are missing creators, is it okay if we add a
        # default value?
        "creators": [
            {
                "person_or_org": {
                    "type": "organizational",
                    "name": "FASER Collaboration",
                },
            },
        ],
    }


# Module-level model instance: extends the base publication model and uses the
# "faser_drafts" entry point group for its rules.
faser_drafts_model = FaserDraftsModel(
    entry_point_group="cds_migrator_kit.migrator.rules.faser_drafts",
    bases=(rdm_base_publication_model,),
)
86 changes: 86 additions & 0 deletions cds_migrator_kit/rdm/records/transform/models/faser_publication.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2026 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM FASER publication model."""

from cds_migrator_kit.rdm.records.transform.models.base_publication_record import (
rdm_base_publication_model,
)
from cds_migrator_kit.transform.overdo import CdsOverdo


class FaserPublicationModel(CdsOverdo):
    """Translation model for FASER publication (non-draft) records."""

    # Same collection filter as the drafts model, but excluding records whose
    # 591__b is "Draft".
    __query__ = (
        '693__:FASER -591__b:"Draft" '
        "AND (980__.a:NOTE "
        "OR (980__.a:CONFERENCEPAPER OR 980__.a:SLIDE OR 980__.a:CONFERENCENOTE) "
        "OR ((980__.a:ARTICLE OR 980__.a:PREPRINT) -980__.a:CONFERENCEPAPER))"
    )

    # MARC keys listed here are skipped on purpose, sorted by tag.
    __ignore_keys__ = {
        # TODO: 20 records has 2 and 9 subfields.
        "0247_9",
        # 0248_ subfields
        "0248_a",
        "0248_p",
        "0248_q",
        # 035__: OAI harvest tags
        "035__d",
        "035__h",
        "035__m",
        "035__t",
        "035__u",
        # arXiv subject
        "037__c",
        # e-mail of contributor
        "100__m",
        # explanation of the affiliation? TODO: can we ignore? 2816452, 2924565
        "100__v",
        # contact person e-mail
        "270__m",
        # number of pages
        "300__a",
        # 5xx subfields
        "500__9",
        "520__9",
        "540__3",
        "542__3",
        # TODO: year is in the wrong field 2816452
        "542__h",
        # TODO: some note? 2798675, 2791032
        "595_Da",
        "595_Dd",
        "595_Ds",
        # e-mail of contributor
        "700__m",
        # explanation of the affiliation? TODO: can we ignore? 2816452, 2924565
        "700__v",
        # TODO: check what is this field
        "773__0",
        # TODO: title of the related links: 2959848, 2949836.
        # values: twiki, internal comments
        "773__p",
        # TODO: one record 2652275
        "773__x",
        # TODO: check what is this field
        "773__z",
        # 8564_: file subfields
        "8564_8",
        "8564_s",
        "8564_x",
        "8564_y",  # file description - done by files dump
        # TODO value: Stamped by WebSubmit.
        # Only EP records: 2888582, 2917427, 2853245, 2766207
        "8564_z",
        # public: 2651328, 2642351, 2702868
        "903__s",
        # e-mail of spokesperson
        "905__m",
        # TODO: one record 2702868
        "906__a",
        "906__m",
        "906__p",
        # redundant year of publication
        "916__y",
        # last modified by / last modification date
        "937__c",
        "937__s",
        # base number
        "960__a",
        # TODO: CDS modification tags
        "961__c",
        "961__h",
        "961__l",
        "961__x",
        # duplicate record id
        "981__a",
    }

    _default_fields = {"custom_fields": {}}


# Module-level model instance: extends the base publication model and uses the
# "faser_publication" entry point group for its rules.
faser_publication_model = FaserPublicationModel(
    entry_point_group="cds_migrator_kit.migrator.rules.faser_publication",
    bases=(rdm_base_publication_model,),
)
3 changes: 2 additions & 1 deletion cds_migrator_kit/rdm/records/transform/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,8 @@ def subjects(json_entry):
if item in keys:
keys.remove(item)

forgotten_keys = [key for key in keys if key not in list(metadata.keys())]
used_keys = list(metadata.keys()) + ["ep_approval"]
forgotten_keys = [key for key in keys if key not in used_keys]
if forgotten_keys:
raise ManualImportRequired("Unassigned metadata key", value=forgotten_keys)
return {k: v for k, v in metadata.items() if v}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2026 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM FASER drafts rules."""

from dojson.errors import IgnoreKey
from dojson.utils import for_each_value

from cds_migrator_kit.errors import UnexpectedValue
from cds_migrator_kit.rdm.records.transform.xml_processing.rules.base import urls

from ...models.faser_drafts import faser_drafts_model as model


@model.over("collection", "^690C_", override=True)
@for_each_value
def collection(self, key, value):
    """Translate the 690C_ collection field into a subject entry.

    Subfield ``a`` is stripped, checked case-insensitively against the
    accepted collections and appended to ``self["subjects"]`` as
    ``collection:<value>`` (original casing is kept in the subject).

    :param self: mapping being populated; its ``subjects`` list is updated.
    :param key: MARC key that matched this rule, reported on errors.
    :param value: subfields of the matched field.
    :raises UnexpectedValue: if subfield ``a`` is missing, empty or not one
        of the accepted collections.
    :raises IgnoreKey: always, once the subject has been stored.
    """
    # ``or ""`` keeps a missing/None subfield from crashing on ``.strip()``
    # with AttributeError; the empty string then fails the check below and is
    # reported as UnexpectedValue like any other bad value.
    collection = (value.get("a") or "").strip()
    if collection.lower() not in ("cern", "faser", "preprint"):
        raise UnexpectedValue(subfield="a", field=key, value=value)
    subjects = self.get("subjects", [])
    subjects.append({"subject": f"collection:{collection}"})
    # Re-assign: when "subjects" was absent, ``get`` returned a fresh list.
    self["subjects"] = subjects
    raise IgnoreKey("collection")


@model.over("resource_type", "^980__", override=True)
def resource_type(self, key, value):
    """Map subfield ``a`` of 980__ to a resource type ``{"id": ...}`` dict.

    The subfield is stripped and lower-cased before the lookup; a missing or
    unknown value raises ``UnexpectedValue``.
    """
    type_ids = {
        "article": "publication-article",
        "preprint": "publication-preprint",
        "scicommpubllhcc": "publication-article",
        "bookchapter": "publication-section",
        "faser_papers": "publication-article",
        "note": "publication-technicalnote",
        "conferencepaper": "publication-conferenceproceeding",
        "slide": "presentation",
        # TODO: is this correct?
        "conferencenote": "publication-technicalnote",
    }
    tag = value.get("a")
    # Falsy values (None, "") are looked up as-is and end up in the error path.
    normalized = tag.strip().lower() if tag else tag
    try:
        return {"id": type_ids[normalized]}
    except KeyError:
        raise UnexpectedValue(
            "Unknown resource type (FASER)", field=key, value=normalized
        )


@model.over("status", "^591__")
@for_each_value
def status(self, key, value):
    """Validate the 591__ status field; only ``draft`` is accepted.

    Nothing is stored for this field: the rule ends by raising ``IgnoreKey``.

    :param self: mapping being populated (not modified here).
    :param key: MARC key that matched this rule, reported on errors.
    :param value: subfields of the matched field.
    :raises UnexpectedValue: if subfield ``b`` is missing, empty or anything
        other than ``draft`` (case-insensitive, surrounding spaces ignored).
    :raises IgnoreKey: always, when the value is valid.
    """
    # ``or ""`` keeps a missing/None subfield from crashing on ``.strip()``
    # with AttributeError; the empty string then fails the check below and is
    # reported as UnexpectedValue like any other bad value.
    status = (value.get("b") or "").strip().lower()
    # TODO: there is no other value, can we ignore?
    if status != "draft":
        raise UnexpectedValue(subfield="b", field=key, value=value)
    raise IgnoreKey("status")


@model.over("restriction_access_notes", "^5061_")
@for_each_value
def restriction_access_notes(self, key, value):
    """Check the 5061_ access note (subfield ``a``) against known values.

    Nothing is stored for this field: the rule always ends by raising either
    ``UnexpectedValue`` (unknown non-empty note) or ``IgnoreKey``.
    """
    # TODO: how to handle these values?
    known_notes = (
        "faser-preprint",
        "faser-slide",
        "faser-confnote",
        "faser-confpaper",
    )
    note = value.get("a", "")
    note = note.strip().lower()
    is_known = note in known_notes
    # An empty note is tolerated; only a non-empty unknown one is an error.
    if note and not is_known:
        raise UnexpectedValue(subfield="a", field=key, value=value)
    raise IgnoreKey("restriction_access_notes")


@model.over("related_ids", "(^773__)", override=True)
@for_each_value
def related_ids(self, key, value):
    """Translate 773__ related links into ``related_identifiers`` entries.

    Subfield ``u`` is mandatory. The conversion is delegated to the base
    ``urls`` rule and only its first result is kept, without duplicates.
    """
    if not value.get("u", ""):
        raise UnexpectedValue(subfield="u", field=key, value=value)

    # TODO: how to transform? https://cds.cern.ch/record/1452204/export/xm
    # Transform like the base `urls` rule, we lose the title "p" it's okay?
    converted = urls(self, key, value)
    if converted:
        first = converted[0]
        known = self.get("related_identifiers", [])
        if first not in known:
            known.append(first)
        # Re-assign: when the key was absent, ``get`` returned a fresh list.
        self["related_identifiers"] = known

    # Raised on every successful path; nothing is returned by this rule.
    raise IgnoreKey("related_ids")
Loading
Loading