diff --git a/src/isp_trace_parser/demand_trace_metadata.py b/src/isp_trace_parser/demand_trace_metadata.py new file mode 100644 index 0000000..c0124ef --- /dev/null +++ b/src/isp_trace_parser/demand_trace_metadata.py @@ -0,0 +1,53 @@ +from pathlib import Path + +from isp_trace_parser import mappings + + +def build( + files: list[Path], + version: str, +) -> dict[Path, dict[str, str | int]]: + """Build metadata for demand files by lookup in the demand mapping. + + The demand YAML is option-keyed, so `_expand_lookup` first expands the + dimensions into a `(location_prefix, dimensions_suffix)`-keyed dict; + each filename then decomposes into those two literal slices (either + side of `_RefYear_<reference_year>_`) for a single lookup. + """ + lookup = _expand_lookup(version) + + file_metadata: dict[Path, dict[str, str | int]] = {} + for path in files: + location_prefix, _, after = path.stem.partition("_RefYear_") + refyear, _, dimensions_suffix = after.partition("_") + key = (location_prefix, dimensions_suffix) + if not refyear.isdigit() or key not in lookup: + raise ValueError(f"Unexpected trace filename: {path.name}") + file_metadata[path] = {**lookup[key], "reference_year": int(refyear)} + return file_metadata + + +def _expand_lookup(version: str) -> dict[tuple[str, str], dict[str, str]]: + """Expand the demand dimensions into a refyear-agnostic lookup. + + Keyed by `(location_prefix, dimensions_suffix)` — the two literal + slices of the filename either side of `_RefYear_<reference_year>_`. For 2024, + `location_prefix` is the subregion and `dimensions_suffix` is + `<scenario>_<poe>_<demand_type>`. `reference_year` is added by `build`.
+ """ + demand = mappings.load("demand", version=version) + topography = mappings.load("topography", version=version) + + lookup: dict[tuple[str, str], dict[str, str]] = {} + for subregion in topography["subregions"]: + for scenario in demand["scenarios"]: + for poe in demand["poe_levels"]: + for demand_type in demand["demand_types"]: + key = (subregion, f"{scenario}_{poe}_{demand_type}") + lookup[key] = { + "subregion": subregion, + "scenario": scenario, + "poe": poe, + "demand_type": demand_type, + } + return lookup diff --git a/src/isp_trace_parser/demand_traces.py b/src/isp_trace_parser/demand_traces.py index 97cf2d1..9703fe6 100644 --- a/src/isp_trace_parser/demand_traces.py +++ b/src/isp_trace_parser/demand_traces.py @@ -7,8 +7,7 @@ from joblib import Parallel, delayed from pydantic import BaseModel, validate_call -from isp_trace_parser import input_validation, mappings -from isp_trace_parser.metadata_extractors import extract_demand_trace_metadata +from isp_trace_parser import demand_trace_metadata, input_validation, mappings from isp_trace_parser.trace_restructure_helper_functions import ( check_filter_by_metadata, get_all_filepaths, @@ -133,11 +132,13 @@ def parse_demand_traces( parsed_directory = input_validation.parsed_directory(parsed_directory) files = get_all_filepaths(input_directory) + file_metadata = demand_trace_metadata.build(files, version="2024") - demand_scenario_mapping = mappings.load("demand_scenario_mapping") + demand_scenario_mapping = mappings.load("demand", version="2024")["scenarios"] partial_func = functools.partial( restructure_demand_file, + all_input_file_metadata=file_metadata, demand_scenario_mapping=demand_scenario_mapping, output_directory=parsed_directory, filters=filters, @@ -155,6 +156,7 @@ def parse_demand_traces( def restructure_demand_file( input_filepath: Path, + all_input_file_metadata: dict[Path, dict[str, str | int]], demand_scenario_mapping: dict[str, str], output_directory: Path, filters: DemandMetadataFilter | None 
= None, @@ -168,6 +170,7 @@ def restructure_demand_file( Args: input_filepath: Path object representing the input demand trace file. + all_input_file_metadata: Metadata for all input files. demand_scenario_mapping: Dictionary mapping raw scenario names to IASR workbook scenario names. output_directory: Directory where restructured files will be saved. filters: DemandMetadataFilter or None, specifies which traces to parse based on metadata. @@ -188,7 +191,7 @@ def restructure_demand_file( # This will process the input file and save it in parquet format in the specified output directory """ - file_metadata = extract_demand_trace_metadata(input_filepath.name) + file_metadata = dict(all_input_file_metadata[input_filepath]) file_metadata["scenario"] = get_save_scenario_for_demand_trace( file_metadata, demand_scenario_mapping @@ -255,19 +258,3 @@ def write_new_demand_filename(metadata: dict[str, str]) -> str: scenario = m["scenario"].replace(" ", "_") return f"{scenario}_RefYear{m['reference_year']}_{subregion}_{m['poe']}_{m['demand_type']}.parquet" - - -def extract_metadata_for_all_demand_files( - filenames: list[Path], -) -> dict[Path, dict[str, str]]: - """ - Extracts metadata for all demand trace files. - - Args: - filenames: List of Path objects representing the demand trace files. - - Returns: - A dictionary with filepaths as keys and metadata dicts as values. - """ - file_metadata = [extract_demand_trace_metadata(str(f.name)) for f in filenames] - return dict(zip(filenames, file_metadata)) diff --git a/src/isp_trace_parser/mappings/2024/demand.yaml b/src/isp_trace_parser/mappings/2024/demand.yaml new file mode 100644 index 0000000..57801bd --- /dev/null +++ b/src/isp_trace_parser/mappings/2024/demand.yaml @@ -0,0 +1,21 @@ +# 2024 ISP demand trace metadata +# (based on the 2024 traces and IASR workbook). +# +# 2024 AEMO demand trace filenames take the form: +# <subregion>_RefYear_<reference_year>_<scenario>_<poe>_<demand_type>.csv +# e.g.
CNSW_RefYear_2011_HYDROGEN_EXPORT_POE10_OPSO_MODELLING.csv +# +# Valid filenames are the product of these dimensions (with subregion, +# sourced from topography.yaml, and reference_year). + +# scenarios: raw AEMO filename code -> IASR workbook display name. +# Keys appear in the filename; values are used in output filenames and +# in the `scenario` column of the parsed parquet. +scenarios: + STEP_CHANGE: Step Change + PROGRESSIVE_CHANGE: Progressive Change + HYDROGEN_EXPORT: Green Energy Exports + +poe_levels: [POE10, POE50] + +demand_types: [OPSO_MODELLING, OPSO_MODELLING_PVLITE, PV_TOT] diff --git a/src/isp_trace_parser/mappings/2024/demand_scenario_mapping.yaml b/src/isp_trace_parser/mappings/2024/demand_scenario_mapping.yaml deleted file mode 100644 index de7d1a7..0000000 --- a/src/isp_trace_parser/mappings/2024/demand_scenario_mapping.yaml +++ /dev/null @@ -1,3 +0,0 @@ -HYDROGEN_EXPORT: Green Energy Exports -STEP_CHANGE: Step Change -PROGRESSIVE_CHANGE: Progressive Change diff --git a/src/isp_trace_parser/metadata_extractors.py b/src/isp_trace_parser/metadata_extractors.py deleted file mode 100644 index a99b450..0000000 --- a/src/isp_trace_parser/metadata_extractors.py +++ /dev/null @@ -1,21 +0,0 @@ -import re - - -def extract_demand_trace_metadata(filename): - # Regex pattern to match the structure of the filename - pattern = re.compile( - r"^(?P[A-Z]+)_RefYear_(?P\d{4})_(?P[A-Z_]+)_(?PPOE\d{2})_(?P[" - r"A-Z_]+)\.csv$" - ) - - # Match the pattern against the filename - match = pattern.match(filename) - - if match: - # If the filename matches the pattern, return a dictionary of captured groups - match_data = match.groupdict() - match_data["reference_year"] = int(match_data["reference_year"]) - return match_data - else: - # If the pattern does not match, raise an error or return None - raise ValueError(f"Filename '{filename}' does not match the expected pattern") diff --git a/tests/test_demand_trace_metadata.py b/tests/test_demand_trace_metadata.py new file 
mode 100644 index 0000000..93a3499 --- /dev/null +++ b/tests/test_demand_trace_metadata.py @@ -0,0 +1,45 @@ +from pathlib import Path + +import pytest + +from isp_trace_parser import demand_trace_metadata + + +def test_build(): + """Two examples spanning different scenario / poe / demand_type / + subregion values. Every combination resolves through the same single + dict lookup, so two are enough for testing. + """ + files = [ + Path("VIC_RefYear_2011_STEP_CHANGE_POE10_OPSO_MODELLING.csv"), + Path("CNSW_RefYear_2023_HYDROGEN_EXPORT_POE50_OPSO_MODELLING_PVLITE.csv"), + ] + metadata = demand_trace_metadata.build(files, version="2024") + + assert metadata[files[0]] == { + "subregion": "VIC", + "reference_year": 2011, + "scenario": "STEP_CHANGE", + "poe": "POE10", + "demand_type": "OPSO_MODELLING", + } + assert metadata[files[1]] == { + "subregion": "CNSW", + "reference_year": 2023, + "scenario": "HYDROGEN_EXPORT", + "poe": "POE50", + "demand_type": "OPSO_MODELLING_PVLITE", + } + + +@pytest.mark.parametrize( + "filename", + [ + "VIC_2011_STEP_CHANGE_POE10_OPSO_MODELLING.csv", # missing _RefYear_ + "VIC_RefYear_201a_STEP_CHANGE_POE10_OPSO_MODELLING.csv", # non-digit year + "VIC_RefYear_2011_MYSTERY_POE10_OPSO_MODELLING.csv", # lookup miss + ], +) +def test_build_rejects_unexpected_filename(filename): + with pytest.raises(ValueError, match="Unexpected trace filename"): + demand_trace_metadata.build([Path(filename)], version="2024") diff --git a/tests/test_trace_file_meta_data_extraction.py b/tests/test_trace_file_meta_data_extraction.py deleted file mode 100644 index 49346cf..0000000 --- a/tests/test_trace_file_meta_data_extraction.py +++ /dev/null @@ -1,11 +0,0 @@ -from isp_trace_parser import metadata_extractors - - -def test_demand_trace_metadata_extraction(): - file_name = "VIC_RefYear_2011_STEP_CHANGE_POE10_OPSO_MODELLING.csv" - metadata = metadata_extractors.extract_demand_trace_metadata(file_name) - assert metadata["subregion"] == "VIC" - assert 
metadata["reference_year"] == 2011 - assert metadata["scenario"] == "STEP_CHANGE" - assert metadata["poe"] == "POE10" - assert metadata["demand_type"] == "OPSO_MODELLING"