From b33ee9f549c6e1016d01299920377fb37c011c22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Z=C3=BCbeyde=20Civelek?= Date: Fri, 5 Jun 2026 15:12:57 +0200 Subject: [PATCH] remove(custom_fields): open access level field --- cds_migrator_kit/rdm/migration_config.py | 3 +- cds_migrator_kit/rdm/records/load/load.py | 2 +- .../rdm/records/transform/transform.py | 1 - .../xml_processing/rules/publications.py | 97 +------ tests/cds-rdm/test_publications_rules.py | 236 +----------------- 5 files changed, 15 insertions(+), 324 deletions(-) diff --git a/cds_migrator_kit/rdm/migration_config.py b/cds_migrator_kit/rdm/migration_config.py index 10c44332..690f6fbf 100644 --- a/cds_migrator_kit/rdm/migration_config.py +++ b/cds_migrator_kit/rdm/migration_config.py @@ -7,11 +7,10 @@ https://inveniordm.docs.cern.ch/reference/configuration/. """ -from pathlib import Path - import json import os from datetime import datetime, timedelta +from pathlib import Path from cds_rdm import schemes from cds_rdm.clc_sync.services.components import ClcSyncComponent diff --git a/cds_migrator_kit/rdm/records/load/load.py b/cds_migrator_kit/rdm/records/load/load.py index 7763aa6a..120dc168 100644 --- a/cds_migrator_kit/rdm/records/load/load.py +++ b/cds_migrator_kit/rdm/records/load/load.py @@ -28,8 +28,8 @@ from invenio_rdm_records.proxies import current_rdm_records_service from invenio_records.systemfields.relations import InvalidRelationValue from marshmallow import ValidationError -from sqlalchemy.exc import IntegrityError from psycopg2.errors import UniqueViolation +from sqlalchemy.exc import IntegrityError from cds_migrator_kit.errors import ( CDSMigrationException, diff --git a/cds_migrator_kit/rdm/records/transform/transform.py b/cds_migrator_kit/rdm/records/transform/transform.py index 4fdad6a5..c6bf6874 100644 --- a/cds_migrator_kit/rdm/records/transform/transform.py +++ b/cds_migrator_kit/rdm/records/transform/transform.py @@ -641,7 +641,6 @@ def field_beams(record_json, custom_fields_dict): "cern:beams": [], "cern:programmes": field_programmes(json_entry), "cern:committees": _cf.get("cern:committees"), - "cern:oa_level": _cf.get("cern:oa_level"), "cern:oa_funding_model": _cf.get("cern:oa_funding_model"), "thesis:thesis": _cf.get("thesis:thesis", {}), "journal:journal": _cf.get("journal:journal", {}), diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/publications.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/publications.py index 1741ac8d..cdabd686 100644 --- a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/publications.py +++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/publications.py @@ -22,16 +22,12 @@ from ...models.base_publication_record import rdm_base_publication_model as model from .base import licenses as _base_licenses from .base import normalize -from .base import note as _base_note -from .base import urls as _base_urls # Unwrapped base functions (strip @for_each_value to avoid double-wrapping). # licenses also has @filter_values beneath @for_each_value, so two levels deep. _raw_licenses = ( _base_licenses.__wrapped__ ) # filter_values(raw) — handles None filtering -_raw_note = _base_note.__wrapped__ # raw note function -_raw_urls = _base_urls.__wrapped__ # raw urls function _FUNDING_MODEL_MAP = { "scoap3": "scoap3", @@ -41,9 +37,6 @@ "other": "other", } -# Lower number = higher priority -_OA_LEVEL_PRIORITY = {"gold": 0, "bronze": 1, "green": 2, "closed": 3} - def _sub(v, code): """Return first string value of a MARC subfield, handling dojson tuple packing.""" @@ -284,48 +277,23 @@ def journal(self, key, value): @model.over("_oa_license", "^540__", override=True) def oa_level_from_license(self, key, value): - """Detect OA level and funding model; also runs base license logic for rights. + """Detect funding model; also runs base license logic for rights. - 540__a: license identifier ('CC BY', 'CC-BY' → gold if 540__3='publication') - 540__f: 'Bronze' → bronze OA level; - 'SCOAP3'|'Collective'|'CERN-RP'|'CERN-APC'|'Other' → funding model - 540__3: 'publication' required for gold; 'preprint' alone → green + 540__f: 'SCOAP3'|'Collective'|'CERN-RP'|'CERN-APC'|'Other' → funding model """ _custom_fields = self.get("custom_fields", {}) rights = self.get("rights", []) for v in force_list(value): qualifier = _sub(v, "f").strip() - scope = _sub(v, "3").strip().lower() - - # Check ALL 'a' subfields: dojson packs repeated subfields as a tuple. - a_vals = force_list(v.get("a")) or () - is_cc_by = any(a.strip().lower() in ["cc by", "cc-by"] for a in a_vals) - is_bronze = qualifier.lower() == "bronze" - is_publication_scope = scope == "publication" - is_preprint_scope = scope == "preprint" - funding_model_id = _FUNDING_MODEL_MAP.get(qualifier.lower()) - current_level = (_custom_fields.get("cern:oa_level") or {}).get("id") - current_priority = _OA_LEVEL_PRIORITY.get(current_level, 99) - - new_level = None - if is_cc_by and is_publication_scope: - new_level = "gold" - elif is_bronze: - new_level = "bronze" - elif is_preprint_scope: - new_level = "green" - - if new_level and _OA_LEVEL_PRIORITY[new_level] < current_priority: - _custom_fields["cern:oa_level"] = {"id": new_level} - if funding_model_id and not _custom_fields.get("cern:oa_funding_model"): _custom_fields["cern:oa_funding_model"] = {"id": funding_model_id} # Base license logic: expand repeated 'a' subfields into individual calls # because clean_val raises UnexpectedValue for tuple values by default. + a_vals = force_list(v.get("a")) or () for a_val in a_vals: if not a_val: continue @@ -339,65 +307,6 @@ def oa_level_from_license(self, key, value): raise IgnoreKey("_oa_license") -@model.over("_oa_annual_report", "^595__", override=True) -def oa_level_from_annual_report(self, key, value): - """Detect 'For annual report' → closed OA; also runs base note logic for internal_notes. - - 595__a = 'For annual report': tentatively marks closed access. - If gold/bronze/green was already set by 540 rules, this is skipped. - The 8564 rule can still upgrade tentative 'closed' to green. - """ - for v in force_list(value): - note_text = _sub(v, "a").strip().lower() - if note_text == "for annual report": - _custom_fields = self.get("custom_fields", {}) - if not _custom_fields.get("cern:oa_level"): - _custom_fields["cern:oa_level"] = {"id": "closed"} - self["custom_fields"] = _custom_fields - - # Delegate base note logic — raises IgnoreKey("internal_notes") on success - try: - _raw_note(self, key, v) - except IgnoreKey: - pass - - raise IgnoreKey("_oa_annual_report") - - -@model.over("_oa_url", "^8564[1_]", override=True) -def oa_level_from_url(self, key, value): - """Detect green OA from preprint/manuscript file links; also runs base URL logic. - - 8564_y: 'preprint' or 'manuscript' → green OA level. - Overrides tentative 'closed' (from 595 rule) but not gold/bronze/green already set. - """ - rel_ids = self.get("related_identifiers", []) - - for v in force_list(value): - sub_y = _sub(v, "y").strip().lower() - if sub_y in ["preprint", "manuscript"]: - _custom_fields = self.get("custom_fields", {}) - current_level = (_custom_fields.get("cern:oa_level") or {}).get("id") - current_priority = _OA_LEVEL_PRIORITY.get(current_level, 99) - if _OA_LEVEL_PRIORITY["green"] < current_priority: - _custom_fields["cern:oa_level"] = {"id": "green"} - self["custom_fields"] = _custom_fields - - # Delegate base URL logic — requires self["recid"] which is always set - # in production (001 field), but may be absent in unit tests. - if "recid" in self: - try: - url_result = _raw_urls(self, key, v) - if url_result and url_result not in rel_ids: - rel_ids.append(url_result) - except IgnoreKey: - pass - - if rel_ids: - self["related_identifiers"] = rel_ids - raise IgnoreKey("_oa_url") - - @model.over("access_grants", "^506[1_]_") @for_each_value def access_grants(self, key, value): diff --git a/tests/cds-rdm/test_publications_rules.py b/tests/cds-rdm/test_publications_rules.py index 1df91c85..edcfda54 100644 --- a/tests/cds-rdm/test_publications_rules.py +++ b/tests/cds-rdm/test_publications_rules.py @@ -17,9 +17,7 @@ isbn, issn, journal, - oa_level_from_annual_report, oa_level_from_license, - oa_level_from_url, udc, ) @@ -361,106 +359,39 @@ def test_journal_issue_without_volume(self): assert journal_info["issue"] == "5" -class TestOaLevelFromLicense: +class TestLicenseAndFundingFrom540: """Tests for oa_level_from_license (540__ rule).""" def _cf(self, record): return record.get("custom_fields", {}) - # --- Gold --- - - def test_gold_cc_by_with_publication_scope(self): + def test_cc_by_license_added_to_rights(self): record = {"custom_fields": {}} with pytest.raises(IgnoreKey): oa_level_from_license(record, "540__", {"a": "CC BY", "3": "publication"}) - assert self._cf(record)["cern:oa_level"] == {"id": "gold"} + assert record["rights"] == [{"id": "cc-by"}] + assert "cern:oa_level" not in self._cf(record) - def test_gold_cc_hyphen_by_with_publication_scope(self): + def test_cc_hyphen_by_license_added_to_rights(self): record = {"custom_fields": {}} with pytest.raises(IgnoreKey): oa_level_from_license(record, "540__", {"a": "CC-BY", "3": "publication"}) - assert self._cf(record)["cern:oa_level"] == {"id": "gold"} - - def test_cc_by_without_publication_scope_is_not_gold(self): - """CC BY alone (no 540__3='publication') should not set gold.""" - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_license(record, "540__", {"a": "CC BY"}) + assert record["rights"] == [{"title": {"en": "CC-BY"}}] assert "cern:oa_level" not in self._cf(record) - def test_cc_by_with_preprint_scope_is_not_gold(self): - """CC BY with 540__3='preprint' → green, not gold.""" - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_license(record, "540__", {"a": "CC BY", "3": "preprint"}) - assert self._cf(record).get("cern:oa_level") == {"id": "green"} - - def test_gold_takes_priority_over_bronze(self): - """Gold in second 540 tag overrides bronze already set.""" - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_license( - record, - "540__", - [{"f": "Bronze"}, {"a": "CC BY", "3": "publication"}], - ) - assert self._cf(record)["cern:oa_level"] == {"id": "gold"} - - # --- Bronze --- - - def test_bronze(self): - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_license(record, "540__", {"f": "Bronze"}) - assert self._cf(record)["cern:oa_level"] == {"id": "bronze"} - - def test_bronze_case_insensitive(self): - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_license(record, "540__", {"f": "bronze"}) - assert self._cf(record)["cern:oa_level"] == {"id": "bronze"} - - def test_bronze_does_not_override_gold(self): - record = {"custom_fields": {"cern:oa_level": {"id": "gold"}}} - with pytest.raises(IgnoreKey): - oa_level_from_license(record, "540__", {"f": "Bronze"}) - assert self._cf(record)["cern:oa_level"] == {"id": "gold"} - - # --- Green --- - - def test_green_from_preprint_scope(self): - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_license(record, "540__", {"3": "Preprint"}) - assert self._cf(record)["cern:oa_level"] == {"id": "green"} - - def test_green_preprint_scope_case_insensitive(self): - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_license(record, "540__", {"3": "preprint"}) - assert self._cf(record)["cern:oa_level"] == {"id": "green"} - - def test_green_does_not_override_bronze(self): - record = {"custom_fields": {"cern:oa_level": {"id": "bronze"}}} - with pytest.raises(IgnoreKey): - oa_level_from_license(record, "540__", {"3": "preprint"}) - assert self._cf(record)["cern:oa_level"] == {"id": "bronze"} - - # --- No OA marker --- - - def test_no_oa_marker_sets_nothing(self): + def test_non_standard_license_added_to_rights(self): record = {"custom_fields": {}} with pytest.raises(IgnoreKey): oa_level_from_license(record, "540__", {"a": "Some other license"}) + assert record["rights"] == [{"title": {"en": "Some other license"}}] assert "cern:oa_level" not in self._cf(record) - # --- Funding model --- - def test_funding_model_scoap3(self): record = {"custom_fields": {}} with pytest.raises(IgnoreKey): oa_level_from_license(record, "540__", {"f": "SCOAP3"}) assert self._cf(record)["cern:oa_funding_model"] == {"id": "scoap3"} + assert "cern:oa_level" not in self._cf(record) def test_funding_model_collective(self): record = {"custom_fields": {}} @@ -487,11 +418,11 @@ def test_funding_model_other(self): assert self._cf(record)["cern:oa_funding_model"] == {"id": "other"} def test_bronze_does_not_set_funding_model(self): - """Bronze is an OA level, not a funding model.""" record = {"custom_fields": {}} with pytest.raises(IgnoreKey): oa_level_from_license(record, "540__", {"f": "Bronze"}) assert "cern:oa_funding_model" not in self._cf(record) + assert "cern:oa_level" not in self._cf(record) def test_funding_model_not_overwritten_by_second_tag(self): """First funding model found wins.""" @@ -501,150 +432,3 @@ def test_funding_model_not_overwritten_by_second_tag(self): record, "540__", [{"f": "SCOAP3"}, {"f": "Collective"}] ) assert self._cf(record)["cern:oa_funding_model"] == {"id": "scoap3"} - - -class TestOaLevelFromAnnualReport: - """Tests for oa_level_from_annual_report (595__ rule).""" - - def _cf(self, record): - return record.get("custom_fields", {}) - - def test_for_annual_report_sets_closed(self): - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_annual_report(record, "595__", {"a": "For annual report"}) - assert self._cf(record)["cern:oa_level"] == {"id": "closed"} - - def test_for_annual_report_case_insensitive(self): - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_annual_report(record, "595__", {"a": "for annual report"}) - assert self._cf(record)["cern:oa_level"] == {"id": "closed"} - - def test_does_not_override_existing_gold(self): - record = {"custom_fields": {"cern:oa_level": {"id": "gold"}}} - with pytest.raises(IgnoreKey): - oa_level_from_annual_report(record, "595__", {"a": "For annual report"}) - assert self._cf(record)["cern:oa_level"] == {"id": "gold"} - - def test_does_not_override_existing_bronze(self): - record = {"custom_fields": {"cern:oa_level": {"id": "bronze"}}} - with pytest.raises(IgnoreKey): - oa_level_from_annual_report(record, "595__", {"a": "For annual report"}) - assert self._cf(record)["cern:oa_level"] == {"id": "bronze"} - - def test_does_not_override_existing_green(self): - record = {"custom_fields": {"cern:oa_level": {"id": "green"}}} - with pytest.raises(IgnoreKey): - oa_level_from_annual_report(record, "595__", {"a": "For annual report"}) - assert self._cf(record)["cern:oa_level"] == {"id": "green"} - - def test_other_595_note_sets_nothing(self): - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_annual_report(record, "595__", {"a": "Not for annual report"}) - assert "cern:oa_level" not in self._cf(record) - - def test_unrelated_note_sets_nothing(self): - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_annual_report(record, "595__", {"a": "Some random note"}) - assert "cern:oa_level" not in self._cf(record) - - -class TestOaLevelFromUrl: - """Tests for oa_level_from_url (8564 rule).""" - - def _cf(self, record): - return record.get("custom_fields", {}) - - def test_preprint_url_sets_green(self): - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_url( - record, - "8564_", - {"y": "preprint", "u": "http://example.com/preprint.pdf"}, - ) - assert self._cf(record)["cern:oa_level"] == {"id": "green"} - - def test_manuscript_url_sets_green(self): - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_url( - record, "8564_", {"y": "manuscript", "u": "http://example.com/ms.pdf"} - ) - assert self._cf(record)["cern:oa_level"] == {"id": "green"} - - def test_preprint_url_case_insensitive(self): - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_url( - record, - "8564_", - {"y": "Preprint", "u": "http://example.com/preprint.pdf"}, - ) - assert self._cf(record)["cern:oa_level"] == {"id": "green"} - - def test_preprint_overrides_closed(self): - """A preprint link should upgrade a tentative 'closed' to green.""" - record = {"custom_fields": {"cern:oa_level": {"id": "closed"}}} - with pytest.raises(IgnoreKey): - oa_level_from_url( - record, - "8564_", - {"y": "preprint", "u": "http://example.com/preprint.pdf"}, - ) - assert self._cf(record)["cern:oa_level"] == {"id": "green"} - - def test_manuscript_overrides_closed(self): - record = {"custom_fields": {"cern:oa_level": {"id": "closed"}}} - with pytest.raises(IgnoreKey): - oa_level_from_url( - record, "8564_", {"y": "manuscript", "u": "http://example.com/ms.pdf"} - ) - assert self._cf(record)["cern:oa_level"] == {"id": "green"} - - def test_preprint_does_not_override_gold(self): - record = {"custom_fields": {"cern:oa_level": {"id": "gold"}}} - with pytest.raises(IgnoreKey): - oa_level_from_url( - record, - "8564_", - {"y": "preprint", "u": "http://example.com/preprint.pdf"}, - ) - assert self._cf(record)["cern:oa_level"] == {"id": "gold"} - - def test_preprint_does_not_override_bronze(self): - record = {"custom_fields": {"cern:oa_level": {"id": "bronze"}}} - with pytest.raises(IgnoreKey): - oa_level_from_url( - record, - "8564_", - {"y": "preprint", "u": "http://example.com/preprint.pdf"}, - ) - assert self._cf(record)["cern:oa_level"] == {"id": "bronze"} - - def test_preprint_does_not_override_green(self): - record = {"custom_fields": {"cern:oa_level": {"id": "green"}}} - with pytest.raises(IgnoreKey): - oa_level_from_url( - record, - "8564_", - {"y": "preprint", "u": "http://example.com/preprint.pdf"}, - ) - assert self._cf(record)["cern:oa_level"] == {"id": "green"} - - def test_non_oa_url_label_sets_nothing(self): - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_url( - record, "8564_", {"y": "fulltext", "u": "http://example.com/paper.pdf"} - ) - assert "cern:oa_level" not in self._cf(record) - - def test_no_y_subfield_sets_nothing(self): - record = {"custom_fields": {}} - with pytest.raises(IgnoreKey): - oa_level_from_url(record, "8564_", {"u": "http://example.com/paper.pdf"}) - assert "cern:oa_level" not in self._cf(record)