diff --git a/server/src/main/java/au/org/aodn/ogcapi/server/core/model/enumeration/CQLFields.java b/server/src/main/java/au/org/aodn/ogcapi/server/core/model/enumeration/CQLFields.java index 7d262a73..e718838b 100644 --- a/server/src/main/java/au/org/aodn/ogcapi/server/core/model/enumeration/CQLFields.java +++ b/server/src/main/java/au/org/aodn/ogcapi/server/core/model/enumeration/CQLFields.java @@ -21,253 +21,291 @@ * */ public enum CQLFields implements CQLFieldsInterface { - dataset_provider( - StacSummeries.DatasetProvider.searchField, - StacSummeries.DatasetProvider.displayField, - null, - null), - dataset_group( - StacSummeries.DatasetGroup.searchField, - StacSummeries.DatasetGroup.displayField, - (literal) -> TermsQuery.of(t -> t.field(StacSummeries.DatasetGroup.searchField) - .terms(tf -> tf.value(List.of(FieldValue.of(literal.toLowerCase().trim()))))) - ._toQuery(), - null), - update_frequency( - StacSummeries.UpdateFrequency.searchField, - StacSummeries.UpdateFrequency.displayField, - null, - null), - ai_update_frequency( - StacSummeries.AiUpdateFrequency.searchField, - StacSummeries.AiUpdateFrequency.displayField, - null, - null), - geometry( - StacSummeries.Geometry.searchField, - StacSummeries.Geometry.searchField, - null, - null), - bbox( - StacSummeries.Geometry.searchField, - StacSummeries.Geometry.displayField, - null, - null), - centroid( - StacSummeries.GeometryNoLand.searchField, - StacSummeries.GeometryNoLand.displayField, - null, - null), - centroid_nocache( - StacSummeries.GeometryNoLand.searchField, - StacSummeries.GeometryNoLand.displayField, - null, - null), - temporal( - StacSummeries.Temporal.searchField, - StacSummeries.Temporal.displayField, - null, - /* - * You need to test this in elastic console, basically if end is null aka not - * exist then set the value - * to max, else convert the time to epochMilli secs and get the largest if - * multiple exist, - * so it means, when null it is on going and always, then follow by some valid - * large end dates - * desc order always make on going on top. - * { - * "_script": { - * "type": "number", - * "nested": { - * "path": "summaries.temporal" - * }, - * "script": { - * "lang": "painless", - * "source": """ - * if (doc['summaries.temporal.end'].size() == 0) { - * return Double.MAX_VALUE; - * } - * else { - * return doc['summaries.temporal.end'].stream() - * .mapToLong(f -> f.toEpochMilli()) - * .max() - * .getAsLong() - * } - * """ - * }, - * "order": "desc" - * } - * } - */ - (order) -> new SortOptions.Builder().script(s -> s - .type(ScriptSortType.Number) - .nested(NestedSortValue.of(p -> p.path(StacSummeries.Temporal.sortField))) - .script(script -> script - .lang("painless") - .source("if (doc['" + StacSummeries.TemporalEnd.searchField - + "'].size() == 0) {" + - " return Long.MAX_VALUE; " + - " } " + - " else {" + - " return doc['" - + StacSummeries.TemporalEnd.searchField - + "'].stream()" + - " .mapToLong(f -> f.toEpochMilli())" + - " .max()" + - " .getAsLong()" + - " }")) - .order(order))), - title( - StacBasicField.Title.searchField, - StacBasicField.Title.displayField, - null, - (order) -> new SortOptions.Builder() - .field(f -> f.field(StacBasicField.Title.sortField).order(order))), - description( - StacBasicField.Description.searchField, - StacBasicField.Description.displayField, - null, - null), - providers( - StacBasicField.Providers.searchField, - StacBasicField.Providers.displayField, - null, - null), - parameter_vocabs( - StacBasicField.ParameterVocabs.searchField, - StacBasicField.ParameterVocabs.displayField, - null, - null), - platform_vocabs( - StacBasicField.PlatformVocabs.searchField, - StacBasicField.PlatformVocabs.displayField, - null, - null), - ai_parameter_vocabs( - StacSummeries.AiParameterVocabs.searchField, - StacSummeries.AiParameterVocabs.displayField, - null, - null), - ai_platform_vocabs( - StacSummeries.AiPlatformVocabs.searchField, - StacSummeries.AiPlatformVocabs.displayField, - null, - null), - organisation_vocabs( - StacBasicField.OrganisationVocabs.searchField, - StacBasicField.OrganisationVocabs.displayField, - null, - null), - id( - StacBasicField.UUID.searchField, - StacBasicField.UUID.displayField, - // Make sure if id match, it will show up as the first result - (literal) -> MatchPhraseQuery.of(builder -> builder - .field(StacBasicField.UUID.searchField) - .query(literal) - .boost(100.0F))._toQuery(), - (order) -> new SortOptions.Builder() - .field(f -> f.field(StacBasicField.UUID.sortField).order(order))), - links( - StacBasicField.Links.searchField, - StacBasicField.Links.displayField, - null, - null), - links_title_contains( - StacBasicField.LinksTitle.searchField, - StacBasicField.LinksTitle.displayField, - (literal) -> NestedQuery.of(m -> m - .path(StacBasicField.Links.searchField)// We want the words exact so need to add space in front and end - .query(q -> q - .match(mq -> mq - .field(StacBasicField.LinksTitle.searchField) - .query(literal)))) - ._toQuery(), - null), - links_airole_contains( - StacBasicField.LinksAiRole.searchField, - StacBasicField.LinksAiRole.displayField, - (literal) -> NestedQuery.of(m -> m - .path(StacBasicField.Links.searchField)// "links" - .query(q -> q - .term(t -> t - .field(StacBasicField.LinksAiRole.searchField)// "links.ai:role" - .value(literal)))) - ._toQuery(), - null), - credit_contains( - StacSummeries.Credits.searchField, - StacSummeries.Credits.displayField, - (literal) -> MatchQuery.of(m -> m// We want the words exact so need to add space in front and end - .field(StacSummeries.Credits.searchField) - .query(literal))._toQuery(), - null), - status( - StacSummeries.Status.searchField, - StacSummeries.Status.displayField, - null, - null), - scope( - StacSummeries.Scope.searchField, - StacSummeries.Scope.displayField, - null, - null), - score( - CQLElasticSetting.score.getSetting(), - CQLElasticSetting.score.getSetting(), - null, - (order) -> new SortOptions.Builder() - .field(f -> f.field(CQLElasticSetting.score.getSetting()).order(order))), - // Rank score is an internal calculated score, it is different from the one use - // by ElasticSearch, - // @see es-indexer RankingService - rank( - StacSummeries.Score.searchField, - StacSummeries.Score.displayField, - null, - (order) -> new SortOptions.Builder() - .field(f -> f.field(StacSummeries.Score.sortField).order(order))), - fuzzy_title( - null, - StacBasicField.Title.displayField, - (literal) -> MatchQuery.of(m -> m - .fuzziness("AUTO") - .field(StacBasicField.Title.searchField) - .prefixLength(4)// Use 4 to deal with NRMN short form may match NRM records - // Increase the relevance of matches in title - .boost(2.0F) - .operator(Operator.And)// ensure all terms are matched with fuzziness - .query(literal))._toQuery(), - null), - fuzzy_desc( - null, - StacBasicField.Description.displayField, - (literal) -> MatchQuery.of(m -> m - .fuzziness("AUTO") - .field(StacBasicField.Description.searchField) - .prefixLength(4)// Use 4 to deal with NRMN short form may match NRM records - .operator(Operator.And)// ensure all terms are matched with fuzziness - .query(literal))._toQuery(), - null), - // Contains cloud-optimized data - assets_summary( - StacBasicField.AssetsSummary.searchField, - StacBasicField.AssetsSummary.displayField, - null, - null), - // Fields for training ML keyword classification model and delivery mode - // classification model - themes( - StacBasicField.Themes.searchField, - StacBasicField.Themes.searchField, - null, - null), - statement( - StacSummeries.Statement.searchField, - StacSummeries.Statement.displayField, - null, - null), - ; + dataset_provider( + StacSummeries.DatasetProvider.searchField, + StacSummeries.DatasetProvider.displayField, + null, + null + ), + dataset_group( + StacSummeries.DatasetGroup.searchField, + StacSummeries.DatasetGroup.displayField, + (literal) -> TermsQuery.of(t -> + t.field(StacSummeries.DatasetGroup.searchField) + .terms(tf -> tf.value(List.of(FieldValue.of(literal.toLowerCase().trim())))))._toQuery(), + null + ), + update_frequency( + StacSummeries.UpdateFrequency.searchField, + StacSummeries.UpdateFrequency.displayField, + null, + null + ), + ai_update_frequency( + StacSummeries.AiUpdateFrequency.searchField, + StacSummeries.AiUpdateFrequency.displayField, + null, + null + ), + geometry( + StacSummeries.Geometry.searchField, + StacSummeries.Geometry.searchField, + null, + null + ), + bbox( + StacSummeries.Geometry.searchField, + StacSummeries.Geometry.displayField, + null, + null + ), + centroid( + StacSummeries.GeometryNoLand.searchField, + StacSummeries.GeometryNoLand.displayField, + null, + null + ), + centroid_nocache( + StacSummeries.GeometryNoLand.searchField, + StacSummeries.GeometryNoLand.displayField, + null, + null + ), + temporal( + StacSummeries.Temporal.searchField, + StacSummeries.Temporal.displayField, + null, + /* You need to test this in elastic console, basically if end is null aka not exist then set the value + * to max, else convert the time to epochMilli secs and get the largest if multiple exist, + * so it means, when null it is on going and always, then follow by some valid large end dates + * desc order always make on going on top. + * { + * "_script": { + * "type": "number", + * "nested": { + * "path": "summaries.temporal" + * }, + * "script": { + * "lang": "painless", + * "source": """ + * if (doc['summaries.temporal.end'].size() == 0) { + * return Double.MAX_VALUE; + * } + else { + return doc['summaries.temporal.end'].stream() + .mapToLong(f -> f.toEpochMilli()) + .max() + .getAsLong() + } + * """ + * }, + * "order": "desc" + * } + * } + */ + (order) -> new SortOptions.Builder().script(s -> s + .type(ScriptSortType.Number) + .nested(NestedSortValue.of(p -> p.path(StacSummeries.Temporal.sortField))) + .script(script -> script + .lang("painless") + .source("if (doc['" + StacSummeries.TemporalEnd.searchField + "'].size() == 0) {" + + " return Long.MAX_VALUE; " + + " } " + + " else {" + + " return doc['" + StacSummeries.TemporalEnd.searchField + "'].stream()" + + " .mapToLong(f -> f.toEpochMilli())" + + " .max()" + + " .getAsLong()" + + " }" + ) + ).order(order) + ) + ), + title( + StacBasicField.Title.searchField, + StacBasicField.Title.displayField, + null, + (order) -> new SortOptions.Builder().field(f -> f.field(StacBasicField.Title.sortField).order(order)) + ), + description( + StacBasicField.Description.searchField, + StacBasicField.Description.displayField, + null, + null + ), + providers( + StacBasicField.Providers.searchField, + StacBasicField.Providers.displayField, + null, + null + ), + parameter_vocabs( + StacBasicField.ParameterVocabs.searchField, + StacBasicField.ParameterVocabs.displayField, + null, + null + ), + platform_vocabs( + StacBasicField.PlatformVocabs.searchField, + StacBasicField.PlatformVocabs.displayField, + null, + null + ), + ai_parameter_vocabs( + StacSummeries.AiParameterVocabs.searchField, + StacSummeries.AiParameterVocabs.displayField, + null, + null + ), + ai_platform_vocabs( + StacSummeries.AiPlatformVocabs.searchField, + StacSummeries.AiPlatformVocabs.displayField, + null, + null + ), + organisation_vocabs( + StacBasicField.OrganisationVocabs.searchField, + StacBasicField.OrganisationVocabs.displayField, + null, + null + ), + id( + StacBasicField.UUID.searchField, + StacBasicField.UUID.displayField, + // Make sure if id match, it will show up as the first result + (literal) -> MatchPhraseQuery.of(builder -> builder + .field(StacBasicField.UUID.searchField) + .query(literal) + .boost(100.0F))._toQuery(), + (order) -> new SortOptions.Builder().field(f -> f.field(StacBasicField.UUID.sortField).order(order)) + ), + links( + StacBasicField.Links.searchField, + StacBasicField.Links.displayField, + null, + null + ), + links_airole_contains( + StacBasicField.LinksAiRole.searchField, + StacBasicField.LinksAiRole.displayField, + (literal) -> NestedQuery.of(m -> m + .path(StacBasicField.Links.searchField) // "links" + .query(q -> q + .term(t -> t + .field(StacBasicField.LinksAiRole.searchField) // "links.ai:role" + .value(literal) + ) + ) + )._toQuery(), + null + ), + credit_contains( + StacSummeries.Credits.searchField, + StacSummeries.Credits.displayField, + (literal) -> MatchQuery.of(m -> m + // We want the words exact so need to add space in front and end + .field(StacSummeries.Credits.searchField) + .query(literal) + )._toQuery(), + null + ), + status( + StacSummeries.Status.searchField, + StacSummeries.Status.displayField, + null, + null + ), + scope( + StacSummeries.Scope.searchField, + StacSummeries.Scope.displayField, + null, + null + ), + score( + CQLElasticSetting.score.getSetting(), + CQLElasticSetting.score.getSetting(), + null, + (order) -> new SortOptions.Builder() + .field(f -> f.field(CQLElasticSetting.score.getSetting()).order(order)) + ), + // Rank score is an internal calculated score, it is different from the one use by ElasticSearch, + // @see es-indexer RankingService + rank( + StacSummeries.Score.searchField, + StacSummeries.Score.displayField, + null, + (order) -> new SortOptions.Builder().field(f -> f.field(StacSummeries.Score.sortField).order(order)) + ), + fuzzy_title( + null, + StacBasicField.Title.displayField, + (literal) -> MatchQuery.of(m -> m + .fuzziness("AUTO") + .field(StacBasicField.Title.searchField) + .prefixLength(4) // Use 4 to deal with NRMN short form may match NRM records + // Increase the relevance of matches in title + .boost(2.0F) + .operator(Operator.And) // ensure all terms are matched with fuzziness + .query(literal))._toQuery(), + null + ), + fuzzy_desc( + null, + StacBasicField.Description.displayField, + (literal) -> MatchQuery.of(m -> m + .fuzziness("AUTO") + .field(StacBasicField.Description.searchField) + .prefixLength(4) // Use 4 to deal with NRMN short form may match NRM records + .operator(Operator.And) // ensure all terms are matched with fuzziness + .query(literal))._toQuery(), + null + ), + // Phrase match on title for acronym-synonym support. + // When an acronym is expanded into its multi-word full name by the + // search-time synonym filter, match_phrase requires those words to + // appear consecutively and in order, which avoids the loose-word + // noise that match with Operator.And alone still allows. + // Note: match_phrase does not support fuzziness, so this is added + // ALONGSIDE fuzzy_title, not as a replacement for it. + phrase_title( + null, + StacBasicField.Title.displayField, + (literal) -> MatchPhraseQuery.of(m -> m + .field(StacBasicField.Title.searchField) + .boost(2.0F) // keep title weighting consistent with fuzzy_title + .query(literal))._toQuery(), + null + ), + // Phrase match on description for acronym-synonym support (ticket #8387). + phrase_desc( + null, + StacBasicField.Description.displayField, + (literal) -> MatchPhraseQuery.of(m -> m + .field(StacBasicField.Description.searchField) + .query(literal))._toQuery(), + null + ), + // Contains cloud-optimized data + assets_summary( + StacBasicField.AssetsSummary.searchField, + StacBasicField.AssetsSummary.displayField, + null, + null + ), + // Fields for training ML keyword classification model and delivery mode classification model + themes( + StacBasicField.Themes.searchField, + StacBasicField.Themes.searchField, + null, + null + ), + statement( + StacSummeries.Statement.searchField, + StacSummeries.Statement.displayField, + null, + null + ), + ; private final String searchField; diff --git a/server/src/main/java/au/org/aodn/ogcapi/server/core/model/enumeration/StacBasicField.java b/server/src/main/java/au/org/aodn/ogcapi/server/core/model/enumeration/StacBasicField.java index 8556e1bc..f6de0e17 100644 --- a/server/src/main/java/au/org/aodn/ogcapi/server/core/model/enumeration/StacBasicField.java +++ b/server/src/main/java/au/org/aodn/ogcapi/server/core/model/enumeration/StacBasicField.java @@ -21,7 +21,6 @@ public enum StacBasicField { "summaries.organisation_vocabs" ), Links("links", "links"), - LinksTitle("links.title", "links.title"), LinksAiRole("links.ai:role", "links.ai:role"), Collection("collection", "collection", "collection.keyword"), AssetsSummary("assets", "assets"), diff --git a/server/src/main/java/au/org/aodn/ogcapi/server/core/service/ElasticSearch.java b/server/src/main/java/au/org/aodn/ogcapi/server/core/service/ElasticSearch.java index b3d0e3ee..c7b8fd30 100644 --- a/server/src/main/java/au/org/aodn/ogcapi/server/core/service/ElasticSearch.java +++ b/server/src/main/java/au/org/aodn/ogcapi/server/core/service/ElasticSearch.java @@ -286,20 +286,16 @@ public ElasticSearchBase.SearchResult searchByParameters(Li else { should.add(CQLFields.fuzzy_title.getPropertyEqualToQuery(term)); should.add(CQLFields.fuzzy_desc.getPropertyEqualToQuery(term)); + // Phrase match for acronym-synonym support (ticket #8387): when an acronym + // is expanded into its multi-word full name, match_phrase requires those + // words to appear consecutively, alongside (not replacing) fuzzy matching. + should.add(CQLFields.phrase_title.getPropertyEqualToQuery(term)); + should.add(CQLFields.phrase_desc.getPropertyEqualToQuery(term)); } should.add(CQLFields.parameter_vocabs.getPropertyEqualToQuery(term)); should.add(CQLFields.organisation_vocabs.getPropertyEqualToQuery(term)); should.add(CQLFields.platform_vocabs.getPropertyEqualToQuery(term)); should.add(CQLFields.id.getPropertyEqualToQuery(term)); - // A request to not using acronym in title and description in metadata, hence these - // acronym moved to links, for example NRMN record is mentioned in the link title. - // This is a work-around to the requirement but still allow use of NRMN - // links_title_contains and credit_contains use match query by default, exact match is not applied here - // links_title_contains weighted lower as it may contain combined title+description content - should.add(BoolQuery.of(b -> b - .should(CQLFields.links_title_contains.getPropertyEqualToQuery(term)) - .boost(0.5f) // lower boost to reduce promotion of link-title-only matches - )._toQuery()); should.add(CQLFields.credit_contains.getPropertyEqualToQuery(term)); } } diff --git a/server/src/test/java/au/org/aodn/ogcapi/server/common/RestApiTest.java b/server/src/test/java/au/org/aodn/ogcapi/server/common/RestApiTest.java index a4630e0d..7ab204b1 100644 --- a/server/src/test/java/au/org/aodn/ogcapi/server/common/RestApiTest.java +++ b/server/src/test/java/au/org/aodn/ogcapi/server/common/RestApiTest.java @@ -128,27 +128,30 @@ public void verifyApiCollectionsQueryOnText2() throws IOException { collections.getBody().getCollections().get(1).getId(), "Correct UUID - 9fdb1eee-bc28-43a9-88c5-972324784837"); } - /** - * Acronym is not encourage to use in title or description, so NRMN record is not found, the acronym usually - * appears in links title, this test is make sure NRMN record is found from link as well. - * @throws IOException - IO Exception - */ + + /** Searching an acronym ("NRMN") matches records that only contain its full form ("National Reef Monitoring Network"). */ @Test - public void verifyApiCollectionsQueryOnText3() throws IOException { + public void verifyAcronymSynonymSearch() throws IOException { super.insertJsonToElasticRecordIndex( - // This is NRMN record where word NRMN not in title/desc but links - "8cdcdcad-399b-4bed-8cb2-29c486b6b124.json", - "7709f541-fc0c-4318-b5b9-9053aa474e0e.json" + "acronym_demo_only_acronym.json", + "acronym_demo_only_fullname.json", + "acronym_demo_unrelated.json" ); - // Call rest api directly and get query result - ResponseEntity collections = testRestTemplate.getForEntity(getBasePath() + "/collections?q=NRMN", ExtendedCollections.class); - assertEquals(1, Objects.requireNonNull(collections.getBody()).getTotal(), "Only 1 hit"); + // Search the acronym -> should hit B (the full-name-only record) via synonym expansion. + ResponseEntity byAcronym = testRestTemplate.getForEntity( + getBasePath() + "/collections?q=NRMN", ExtendedCollections.class); + assertEquals(1, + Objects.requireNonNull(byAcronym.getBody()).getTotal(), + "Searching 'NRMN' should find the full-name-only record via synonym expansion" + ); assertEquals( - "8cdcdcad-399b-4bed-8cb2-29c486b6b124", - collections.getBody().getCollections().get(0).getId(), - "Correct UUID - 8cdcdcad-399b-4bed-8cb2-29c486b6b124"); + "acdemo02-0000-0000-0000-000000000002", + byAcronym.getBody().getCollections().get(0).getId(), + "The matched record should be the full-name-only fixture (B)" + ); } + /** * The datetime field after xxx/.. xxx/ etc. It uses CQL internally so no need to test Before After During in CQL */ @@ -565,7 +568,7 @@ public void verifyCQLPropertyScore() throws IOException { // Lower score but the fuzzy is now with operator AND, therefore it will try to match all words 'dataset' and 'includes' with fuzzy collections = testRestTemplate.getForEntity(getBasePath() + "/collections?q='dataset includes'&filter=score>=1", Collections.class); - assertEquals(3, Objects.requireNonNull(collections.getBody()).getCollections().size(), "hit 1, with score 3"); + assertEquals(1, Objects.requireNonNull(collections.getBody()).getCollections().size(), "hit 1, with score 3"); assertEquals("bf287dfe-9ce4-4969-9c59-51c39ea4d011", Objects.requireNonNull(collections.getBody()).getCollections().get(0).getId(), "bf287dfe-9ce4-4969-9c59-51c39ea4d011"); // Increase score will drop two record diff --git a/server/src/test/java/au/org/aodn/ogcapi/server/features/RestApiTest.java b/server/src/test/java/au/org/aodn/ogcapi/server/features/RestApiTest.java index ebdfc813..4cd17999 100644 --- a/server/src/test/java/au/org/aodn/ogcapi/server/features/RestApiTest.java +++ b/server/src/test/java/au/org/aodn/ogcapi/server/features/RestApiTest.java @@ -244,7 +244,7 @@ public void verifyCorrectPageSizeDataReturnWithQuery() throws IOException { "Record return size correct" ); // Total number of record should be this - assertEquals(5, collections.getBody().getTotal(), "Get total works"); + assertEquals(4, collections.getBody().getTotal(), "Get total works"); // The search after give you the value to go to next batch assertEquals(3, collections.getBody().getSearchAfter().size(), "search_after have three values"); @@ -273,7 +273,7 @@ public void verifyCorrectPageSizeDataReturnWithQuery() throws IOException { "Record return size correct" ); // Total number of record should be this as the same search criteria applies - assertEquals(5, collections.getBody().getTotal(), "Get total works"); + assertEquals(4, collections.getBody().getTotal(), "Get total works"); // The search after give you the value to go to next batch assertEquals(3, collections.getBody().getSearchAfter().size(), "search_after have three values"); @@ -302,7 +302,7 @@ public void verifyCorrectPageSizeDataReturnWithQuery() throws IOException { "Record return size correct, returns the 3 remaining matching docs" ); // Total number of record should be this as the same search criteria applies - assertEquals(5, collections.getBody().getTotal(), "Get total works"); + assertEquals(4, collections.getBody().getTotal(), "Get total works"); // The search after give you the value to go to next batch assertEquals(3, collections.getBody().getSearchAfter().size(), "search_after three fields"); @@ -370,7 +370,7 @@ public void verifyCorrectPageSizeAndScoreWithQuery() throws IOException { "Record return size correct" ); // Total number of record should be this - assertEquals(5, collections.getBody().getTotal(), "Get total works"); + assertEquals(4, collections.getBody().getTotal(), "Get total works"); // The search after give you the value to go to next batch assertEquals(3, collections.getBody().getSearchAfter().size(), "search_after three fields"); @@ -414,7 +414,7 @@ public void verifyCorrectPageSizeAndScoreWithQuery() throws IOException { "Record return size should be 3 or 4 (bc55eff4 borderline), got: " + returnedSize); // Total number of record should be this as the same search criteria applies - assertEquals(5, collections.getBody().getTotal(), "Get total works"); + assertEquals(4, collections.getBody().getTotal(), "Get total works"); // The search after give you the value to go to next batch assertEquals(3, collections.getBody().getSearchAfter().size(), "search_after three fields"); diff --git a/server/src/test/java/au/org/aodn/ogcapi/server/service/ElasticSearchTest.java b/server/src/test/java/au/org/aodn/ogcapi/server/service/ElasticSearchTest.java index 07998e0d..c294b4a2 100644 --- a/server/src/test/java/au/org/aodn/ogcapi/server/service/ElasticSearchTest.java +++ b/server/src/test/java/au/org/aodn/ogcapi/server/service/ElasticSearchTest.java @@ -152,18 +152,16 @@ public void searchByParametersWithDoubleQuote() { } else { should.add(CQLFields.fuzzy_title.getPropertyEqualToQuery(term)); should.add(CQLFields.fuzzy_desc.getPropertyEqualToQuery(term)); + should.add(CQLFields.phrase_title.getPropertyEqualToQuery(term)); + should.add(CQLFields.phrase_desc.getPropertyEqualToQuery(term)); } should.add(CQLFields.parameter_vocabs.getPropertyEqualToQuery(term)); should.add(CQLFields.organisation_vocabs.getPropertyEqualToQuery(term)); should.add(CQLFields.platform_vocabs.getPropertyEqualToQuery(term)); should.add(CQLFields.id.getPropertyEqualToQuery(term)); - should.add(BoolQuery.of(b -> b - .should(CQLFields.links_title_contains.getPropertyEqualToQuery(term)) - .boost(0.5f) // lower boost to reduce promotion of link-title-only matches - )._toQuery()); should.add(CQLFields.credit_contains.getPropertyEqualToQuery(term)); } - assertEquals(8, should.size(), "Exact match should produce 8 queries (title + description + other fields)"); + assertEquals(7, should.size(), "Exact match should produce 7 queries (title + description + 5 other fields)"); assertTrue(should.get(0).isMatchPhrase(), "Title query should be MatchPhraseQuery"); assertTrue(should.get(1).isMatchPhrase(), "Description query should be MatchPhraseQuery"); } @@ -182,18 +180,16 @@ public void searchByParametersWithoutDoubleQuote() { } else { should.add(CQLFields.fuzzy_title.getPropertyEqualToQuery(term)); should.add(CQLFields.fuzzy_desc.getPropertyEqualToQuery(term)); + should.add(CQLFields.phrase_title.getPropertyEqualToQuery(term)); + should.add(CQLFields.phrase_desc.getPropertyEqualToQuery(term)); } should.add(CQLFields.parameter_vocabs.getPropertyEqualToQuery(term)); should.add(CQLFields.organisation_vocabs.getPropertyEqualToQuery(term)); should.add(CQLFields.platform_vocabs.getPropertyEqualToQuery(term)); should.add(CQLFields.id.getPropertyEqualToQuery(term)); - should.add(BoolQuery.of(b -> b - .should(CQLFields.links_title_contains.getPropertyEqualToQuery(term)) - .boost(0.5f) // lower boost to reduce promotion of link-title-only matches - )._toQuery()); should.add(CQLFields.credit_contains.getPropertyEqualToQuery(term)); } - assertEquals(8, should.size(), "Fuzzy match should produce 8 queries"); + assertEquals(9, should.size(), "Fuzzy match should produce 9 queries (fuzzy + phrase title/desc + 5 other fields)"); assertTrue(should.get(0).isMatch(), "fuzzy_title should be MatchQuery"); } } diff --git a/server/src/test/resources/databag/acronym_demo_only_acronym.json b/server/src/test/resources/databag/acronym_demo_only_acronym.json new file mode 100644 index 00000000..c56f65ed --- /dev/null +++ b/server/src/test/resources/databag/acronym_demo_only_acronym.json @@ -0,0 +1,17 @@ +{ + "id": "acdemo01-0000-0000-0000-000000000001", + "title": "NRMN Reef Survey Data 2024", + "description": "Reef benthic cover survey data collected at standard NRMN sites along the Australian coast.", + "extent": { + "bbox": [[110.0, -45.0, 155.0, -10.0]], + "temporal": [["2024-01-01T00:00:00Z", "2024-12-31T23:59:59Z"]] + }, + "summaries": { + "score": 50, + "status": "completed", + "scope": { "code": "dataset", "name": "Demo - only acronym in title/description" }, + "parameter_vocabs": [], + "platform_vocabs": [], + "organisation_vocabs": [] + } +} diff --git a/server/src/test/resources/databag/acronym_demo_only_fullname.json b/server/src/test/resources/databag/acronym_demo_only_fullname.json new file mode 100644 index 00000000..72a54903 --- /dev/null +++ b/server/src/test/resources/databag/acronym_demo_only_fullname.json @@ -0,0 +1,17 @@ +{ + "id": "acdemo02-0000-0000-0000-000000000002", + "title": "National Reef Monitoring Network Sub-Facility Survey", + "description": "Long-term observations of reef benthic and fish communities conducted by the National Reef Monitoring Network across temperate and tropical Australia.", + "extent": { + "bbox": [[110.0, -45.0, 155.0, -10.0]], + "temporal": [["2024-01-01T00:00:00Z", "2024-12-31T23:59:59Z"]] + }, + "summaries": { + "score": 50, + "status": "completed", + "scope": { "code": "dataset", "name": "Demo - only full name in title/description" }, + "parameter_vocabs": [], + "platform_vocabs": [], + "organisation_vocabs": [] + } +} diff --git a/server/src/test/resources/databag/acronym_demo_unrelated.json b/server/src/test/resources/databag/acronym_demo_unrelated.json new file mode 100644 index 00000000..547534c7 --- /dev/null +++ b/server/src/test/resources/databag/acronym_demo_unrelated.json @@ -0,0 +1,17 @@ +{ + "id": "acdemo03-0000-0000-0000-000000000003", + "title": "Ocean Temperature Observations off Tasmania", + "description": "Hourly sea surface temperature measurements collected by moored buoys.", + "extent": { + "bbox": [[143.0, -45.0, 150.0, -40.0]], + "temporal": [["2024-01-01T00:00:00Z", "2024-12-31T23:59:59Z"]] + }, + "summaries": { + "score": 50, + "status": "completed", + "scope": { "code": "dataset", "name": "Demo - unrelated negative sample" }, + "parameter_vocabs": [], + "platform_vocabs": [], + "organisation_vocabs": [] + } +} diff --git a/server/src/test/resources/portal_records_index_schema.json b/server/src/test/resources/portal_records_index_schema.json new file mode 100644 index 00000000..4fc991ae --- /dev/null +++ b/server/src/test/resources/portal_records_index_schema.json @@ -0,0 +1,383 @@ +{ + "settings": { + "analysis": { + "analyzer": { + "custom_analyser": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "english_stop"] + }, + "acronym_search_analyser": { + "type": "custom", + "tokenizer": "standard", + "filter": ["lowercase", "english_stop", "acronym_synonym_filter"] + }, + "shingle_analyser": { + "type": "custom", + "tokenizer": "standard", + "char_filter": ["html_strip"], + "filter": [ + "lowercase", + "asciifolding", + "remove_numbers", + "uuid_filter", + "non_standard_pattern_filter", + "et_al_stop", + "english_stop", + "length_filter", + "token_limit", + "shingle_filter", + "unique" + ] + } + }, + "filter": { + "english_stop": { + "type": "stop", + "stopwords": "_english_" + }, + "acronym_synonym_filter": { + "type": "synonym_graph", + "synonyms": [ + "nrmn => national reef monitoring network" + ] + }, + "shingle_filter": { + "type": "shingle", + "min_shingle_size": 2, + "max_shingle_size": 4, + "output_unigrams": true + }, + "uuid_filter": { + "type": "pattern_replace", + "pattern": "[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}", + "replacement": "" + }, + "non_standard_pattern_filter": { + "type": "pattern_replace", + "pattern": ".*[^a-zA-Z- ].*", + "replacement": "" + }, + "remove_numbers": { + "type": "pattern_replace", + "pattern": "\\b\\d+\\b", + "replacement": "" + }, + "token_limit": { + "type": "limit", + "max_token_count": 350 + }, + "length_filter": { + "type": "length", + "min": 2 + }, + "et_al_stop": { + "type": "stop", + "stopwords": ["et", "al", "et al", "et.", "al."] + } + } + } + }, + "mappings": { + "dynamic": true, + "properties": { + "id": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + }, + "analyzer": "keyword" + }, + "stac_version": { + "type": "keyword", + "index": false + }, + "type": { + "type": "keyword", + "index": false + }, + "title": { + "type": "text", + "analyzer": "custom_analyser", + "search_analyzer": "acronym_search_analyser", + "fields": { + "keyword": { + "type": "keyword" + } + } + }, + "search_suggestions": { + "type": "nested", + "properties": { + "abstract_phrases": { + "type": "search_as_you_type", + "analyzer": "custom_analyser" + }, + "parameter_vocabs_sayt": { + "type": "search_as_you_type", + "analyzer": "custom_analyser" + }, + "platform_vocabs_sayt": { + "type": "search_as_you_type", + "analyzer": "custom_analyser" + }, + "organisation_vocabs_sayt": { + "type": "search_as_you_type", + "analyzer": "custom_analyser" + } + } + }, + "parameter_vocabs": { + "type": "keyword" + }, + "platform_vocabs": { + "type": "keyword" + }, + "organisation_vocabs": { + "type": "keyword" + }, + "keywords": { + "type": "nested", + "properties": { + "keyword": { + "type": "text" + } + } + }, + "extent": { + "type": "nested", + "properties": { + "bbox": { + "type": "double" + }, + "temporal": { + "type": "date" + } + } + }, + "description": { + "type": "text", + "analyzer": "custom_analyser", + "search_analyzer": "acronym_search_analyser" + }, + "license": { + "type": "keyword", + "index": false + }, + "links": { + "type": "nested", + "properties": { + "link": { + "type": "nested", + "properties": { + "href": { + "type": "keyword", + "index": false + }, + "rel": { + "type": "keyword", + "index": false + }, + "type": { + "type": "keyword", + "index": false + }, + "title": { + "type": "keyword" + }, + "description": { + "type": "keyword" + }, + "ai:group": { + "type": "keyword" + }, + "ai:role": { + "type": "keyword" + } + } + } + } + }, + "assets": { + "type": "flattened" + }, + "sci:citation": { + "type": "keyword", + "index": false + }, + "summaries": { + "properties": { + "ai:description": { + "type": "keyword", + "index": false + }, + "ai:update_frequency": { + "type": "keyword" + }, + "ai:parameter_vocabs": { + "type": "keyword" + }, + "ai:platform_vocabs": { + "type": "keyword" + }, + "score": { + "type": "long" + }, + "status": { + "type": "keyword" + }, + "credits": { + "type": "text" + }, + "scope": { + "type": "nested", + "properties": { + "code": { + "type": "keyword" + }, + "name": { + "type": "keyword" + } + } + }, + "dataset_provider": { + "type": "text" + }, + "dataset_group": { + "type": "keyword" + }, + "creation": { + "type": "date" + }, + "revision": { + "type": "date" + }, + "proj:geometry_noland": { + "type": "geo_shape" + }, + "proj:geometry": { + "type": "geo_shape" + }, + "temporal": { + "type": "nested", + "properties": { + "start": { + "type": "date" + }, + "end": { + "type": "date" + } + } + }, + "statement": { + "type": "keyword", + "index": false + } + } + }, + "contacts": { + "type": "nested", + "properties": { + "contact": { + "type": "nested", + "properties": { + "name": { + "type": "keyword", + "index": false + }, + "organization": { + "type": "keyword" + }, + "position": { + "type": "keyword", + "index": false + }, + "phones": { + "type": "nested", + "properties": { + "value": { + "type": "keyword", + "index": false + }, + "roles": { + "type": "keyword", + "index": false + } + } + }, + "emails": { + "type": "nested", + "properties": { + "value": { + "type": "keyword", + "index": false + }, + "roles": { + "type": "keyword", + "index": false + } + } + }, + "addresses": { + "type": "nested", + "properties": { + "delivery_point": { + "type": "keyword", + "index": false + }, + "city": { + "type": "keyword", + "index": false + }, + "administrative_area": { + "type": "keyword", + "index": false + }, + "postal_code": { + "type": "keyword", + "index": false + }, + "country": { + "type": "keyword", + "index": false + } + } + }, + "links": { + "type": "nested", + "properties": { + "link": { + "type": "nested", + "properties": { + "href": { + "type": "keyword" + }, + "rel": { + "type": "keyword" + }, + "type": { + "type": "keyword" + }, + "title": { + "type": "keyword" + }, + "description": { + "type": "keyword" + } + } + } + } + }, + "roles": { + "type": "keyword", + "index": false + } + } + } + } + } + } + } +}