From c465418a4e346eeb87cb4198901294a0ae51eb83 Mon Sep 17 00:00:00 2001 From: HaneenT Date: Fri, 29 Aug 2025 08:34:06 -0400 Subject: [PATCH 01/54] KPMP-5863: raji bulk upload yaml generation script --- data_management/generate_sc_rnaseq_yaml.py | 38 ++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 data_management/generate_sc_rnaseq_yaml.py diff --git a/data_management/generate_sc_rnaseq_yaml.py b/data_management/generate_sc_rnaseq_yaml.py new file mode 100644 index 0000000..ecbcf7e --- /dev/null +++ b/data_management/generate_sc_rnaseq_yaml.py @@ -0,0 +1,38 @@ +import os +import yaml +import sys + +yamlData = { + "package_type": "Single-cell RNA-Seq", + "tis": "Michigan/Broad/Princeton", + "data_generators": "Rajasree Menon", + "dataset_description": "" +} +experiments = [] + +if len(sys.argv) == 1: + print("Error. Please specify directory: python3 generate_sc_rnaseq_yaml.py /path/to/bulk/upload") + exit(1) + +dir = sys.argv[1] +for root, dirs, files in os.walk(dir): + if root == dir: + continue + sample_id = os.path.split(root)[1] + experiment = { + "internal_experiment_id": sample_id, + "files": [] + } + for file in files: + experiment['files'].append({ + 'redcap_id': sample_id, + 'spectrack_sample_id': sample_id, + 'relative_file_path_and_name': sample_id + '/' + file, + 'file_metadata': "" + }) + experiments.append({ + "experiment": experiment + }) +yamlData["experiments"] = experiments +with open(os.path.join(dir, 'bulk-manifest.yaml'), 'w') as file: + yaml.dump(yamlData, file) From f74b9e4d3f16d7a940cf1ffd38f037927f4a43b3 Mon Sep 17 00:00:00 2001 From: rlreamy <34109594+rlreamy@users.noreply.github.com> Date: Wed, 3 Sep 2025 13:53:56 -0400 Subject: [PATCH 02/54] Update changelog.md --- changelog.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/changelog.md b/changelog.md index 5b2eaad..cdaf88a 100644 --- a/changelog.md +++ b/changelog.md @@ -1,14 +1,27 @@ # Changelog +## Release 1.10 [Unreleased] +Breif summary: -## Release 1.9 [unreleased] +### Breaking changes + +### Other changes + +--- +## Release 1.9 [Released 8/2/2025] +Brief summary - Load Tableau database with biopsy_tracker data - Insert dlu_upload_type to dlu_package_inventory table - Create recalled packages endpoint - Update Multi Modal package name - Tweak bulk upload fields +### Breaking changes +- changed column names in tables +- added new columns to dlu_package_inventory table + +--- ## Release 1.8.1 [Released 11/8/2024] Brief summary of what's in this release: From cacb4035636d6263e7535cbf2ef2885c14c1df9e Mon Sep 17 00:00:00 2001 From: rlreamy <34109594+rlreamy@users.noreply.github.com> Date: Wed, 3 Sep 2025 13:54:47 -0400 Subject: [PATCH 03/54] Update rebuild.sh --- data_management/rebuild.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_management/rebuild.sh b/data_management/rebuild.sh index 985f891..5b70a1d 100644 --- a/data_management/rebuild.sh +++ b/data_management/rebuild.sh @@ -1,2 +1,2 @@ python3 setup.py install --user -docker build -t kingstonduo/data-management:1.8.1 . +docker build -t kingstonduo/data-management:1.10 . From 756b92376069adb9f071bfe948f9fa81fc6a4e9d Mon Sep 17 00:00:00 2001 From: rlreamy <34109594+rlreamy@users.noreply.github.com> Date: Wed, 3 Sep 2025 14:08:47 -0400 Subject: [PATCH 04/54] Update build-libra.yml --- .github/workflows/build-libra.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-libra.yml b/.github/workflows/build-libra.yml index 0232776..ecff190 100644 --- a/.github/workflows/build-libra.yml +++ b/.github/workflows/build-libra.yml @@ -6,7 +6,7 @@ on: jobs: docker: env: - IMAGE_TAG: "1.8.1" + IMAGE_TAG: "1.10" runs-on: ubuntu-latest steps: - name: Get branch names From a06b99a7072937678900062d844dd853022a61f9 Mon Sep 17 00:00:00 2001 From: rlreamy <34109594+rlreamy@users.noreply.github.com> Date: Thu, 4 Sep 2025 13:21:27 -0400 Subject: [PATCH 05/54] Update changelog.md --- changelog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog.md b/changelog.md index cdaf88a..0d5b047 100644 --- a/changelog.md +++ b/changelog.md @@ -9,7 +9,7 @@ Breif summary: --- -## Release 1.9 [Released 8/2/2025] +## Release 1.9 [Released 9/2/2025] Brief summary - Load Tableau database with biopsy_tracker data - Insert dlu_upload_type to dlu_package_inventory table From cdba63eada5b9fd64a0aa66bcb1bd1caccf644bf Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Mon, 15 Sep 2025 15:30:25 -0400 Subject: [PATCH 06/54] KPMP-6216: skip calc checksums for package recall --- data_management/Dockerfile | 2 +- data_management/app.py | 8 ++++---- data_management/services/dlu_filesystem.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/data_management/Dockerfile b/data_management/Dockerfile index ba017e7..1fe48b1 100644 --- a/data_management/Dockerfile +++ b/data_management/Dockerfile @@ -31,5 +31,5 @@ COPY app.py ./ COPY process_bulk_uploads.py ./ COPY services/ ./services -ENTRYPOINT ["gunicorn", "-b", ":5000", "app:app"] +ENTRYPOINT ["gunicorn", "-b", ":5000", "app:app", "-t", "600"] diff --git a/data_management/app.py b/data_management/app.py index e0507a6..c8a649e 100644 --- a/data_management/app.py +++ b/data_management/app.py @@ -82,7 +82,7 @@ def recall_dlu_package(package_id): return error_msg dlu_data_directory = '/data/package_' + package_id - directory_info = DirectoryInfo(dlu_data_directory) + directory_info = DirectoryInfo(dlu_data_directory, calculate_checksums = False) file_list = None if directory_info.file_count == 0 and directory_info.subdir_count == 0: error_msg = "Error: package " + package_id + " has no files or top level subdirectory" @@ -92,9 +92,9 @@ def recall_dlu_package(package_id): if directory_info.file_count == 0 and directory_info.subdir_count == 1: contents = "".join(directory_info.dir_contents) top_level_subdir = package_id + "/" + contents - file_list = dlu_file_handler.match_files(top_level_subdir) + file_list = dlu_file_handler.match_files(top_level_subdir,False) else: - file_list = dlu_file_handler.match_files(package_id) + file_list = dlu_file_handler.match_files(package_id,False) dlu_files = [] for file in directory_info.file_details: @@ -117,4 +117,4 @@ def get_package_status(package_id): dlu_package_inventory = DLUPackageInventory() dlu_package_inventory.reconnect() status = dlu_package_inventory.get_package_status(package_id) - return status[0]["globus_dlu_status"] if len(status) > 0 and status[0]["globus_dlu_status"] is not None else "" \ No newline at end of file + return status[0]["globus_dlu_status"] if len(status) > 0 and status[0]["globus_dlu_status"] is not None else "" diff --git a/data_management/services/dlu_filesystem.py b/data_management/services/dlu_filesystem.py index 82dd329..8323144 100644 --- a/data_management/services/dlu_filesystem.py +++ b/data_management/services/dlu_filesystem.py @@ -202,13 +202,13 @@ def process_globus_directory(self, directoryListing, globusDirectories: list[Dir self.process_globus_directory(directoryListing, globusDirectories, packageId, currentDir) return directoryListing - def match_files(self, packageId) -> list[DLUFile]: + def match_files(self, packageId, calculate_checksums: bool = True) -> list[DLUFile]: topLevelDir = DirectoryInfo(self.globus_data_directory + '/' + self.globus_dir_prefix + packageId) globusFiles = [] globusDirectories = [] for obj in topLevelDir.file_details: if os.path.isdir(obj.path): - directory = DirectoryInfo(obj.path) + directory = DirectoryInfo(obj.path, calculate_checksums = calculate_checksums) globusDirectories.append(directory) else: globusFiles.append(obj) @@ -225,4 +225,4 @@ def get_globus_file_paths(self, filesInGlobusDirectories: dict[str, list[DLUFile prefix = dir + "/" if dir else "" file.name = prefix + file.name fileList.append(file) - return fileList \ No newline at end of file + return fileList From 6b6dac62232a1ecbfeb64ec550f6b42a3727a9fe Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Tue, 16 Sep 2025 09:50:13 -0400 Subject: [PATCH 07/54] KPMP-6216: additional fixes --- .java-version | 1 + data_management/services/dlu_filesystem.py | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) create mode 100644 .java-version diff --git a/.java-version b/.java-version new file mode 100644 index 0000000..b4de394 --- /dev/null +++ b/.java-version @@ -0,0 +1 @@ +11 diff --git a/data_management/services/dlu_filesystem.py b/data_management/services/dlu_filesystem.py index 8323144..7cf69d7 100644 --- a/data_management/services/dlu_filesystem.py +++ b/data_management/services/dlu_filesystem.py @@ -183,7 +183,7 @@ def validate_package_directories(self, package_id: str): logger.error("Directory for package " + package_id + " failed validation.") return success - def process_globus_directory(self, directoryListing, globusDirectories: list[DirectoryInfo], packageId, initialDir): + def process_globus_directory(self, directoryListing, globusDirectories: list[DirectoryInfo], packageId, initialDir, calculate_checksums: bool = True): for dir in globusDirectories: prefix = "" if not initialDir == "": @@ -194,16 +194,16 @@ def process_globus_directory(self, directoryListing, globusDirectories: list[Dir globusDirectories = [] for item in dir.file_details: if os.path.isdir(item.path): - globusDirectories.append(DirectoryInfo(item.path)) + globusDirectories.append(DirectoryInfo(item.path, calculate_checksums = calculate_checksums)) else: globusFiles.append(item) directoryListing[currentDir] = globusFiles if len(globusDirectories) > 0: - self.process_globus_directory(directoryListing, globusDirectories, packageId, currentDir) + self.process_globus_directory(directoryListing, globusDirectories, packageId, currentDir,calculate_checksums) return directoryListing def match_files(self, packageId, calculate_checksums: bool = True) -> list[DLUFile]: - topLevelDir = DirectoryInfo(self.globus_data_directory + '/' + self.globus_dir_prefix + packageId) + topLevelDir = DirectoryInfo(self.globus_data_directory + '/' + self.globus_dir_prefix + packageId, calculate_checksums = calculate_checksums) globusFiles = [] globusDirectories = [] for obj in topLevelDir.file_details: @@ -215,7 +215,7 @@ def match_files(self, packageId, calculate_checksums: bool = True) -> list[DLUFi filesInGlobusDirectories = {} filesInGlobusDirectories[""] = globusFiles currentDir = "" - filesInGlobusDirectories = self.process_globus_directory(filesInGlobusDirectories, globusDirectories, packageId, currentDir) + filesInGlobusDirectories = self.process_globus_directory(filesInGlobusDirectories, globusDirectories, packageId, currentDir, calculate_checksums) return self.get_globus_file_paths(filesInGlobusDirectories) def get_globus_file_paths(self, filesInGlobusDirectories: dict[str, list[DLUFile]]) -> list[DLUFile]: From 9fba062759b3621d4a79e2f43fc98b1e7eb408af Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Mon, 22 Sep 2025 11:10:34 -0400 Subject: [PATCH 08/54] KPMP-6197: Fixes to the bulk uploader to be up to speed on changes to system --- data_management/BulkUploader | 25 +++++++++++++++++++++++++ data_management/lib/mysql_connection.py | 4 ++-- data_management/model/dlu_package.py | 2 ++ data_management/process_bulk_uploads.py | 13 ++++++++----- data_management/services/dlu_mongo.py | 2 +- 5 files changed, 38 insertions(+), 8 deletions(-) create mode 100644 data_management/BulkUploader diff --git a/data_management/BulkUploader b/data_management/BulkUploader new file mode 100644 index 0000000..355cfb5 --- /dev/null +++ b/data_management/BulkUploader @@ -0,0 +1,25 @@ +FROM python:3.10-slim-bullseye + +WORKDIR /usr/src/app + +ENV FLASK_APP=app.py +ENV FLASK_RUN_HOST=0.0.0.0 + +RUN apt-get update \ + && apt-get install -y curl + +COPY requirements.txt ./ + +RUN pip3 config --user set global.progress_bar off +RUN pip3 install --no-cache-dir -r requirements.txt +RUN pip3 install -U flask-cors + +COPY lib/ ./lib +COPY main.py ./ +COPY app.py ./ +COPY process_bulk_uploads.py ./ +COPY services/ ./services +COPY model/ ./model +COPY .env ./.env + +ENTRYPOINT [] \ No newline at end of file diff --git a/data_management/lib/mysql_connection.py b/data_management/lib/mysql_connection.py index 302ec9f..f3d67f7 100644 --- a/data_management/lib/mysql_connection.py +++ b/data_management/lib/mysql_connection.py @@ -86,7 +86,7 @@ def get_db_connection(self): self.database.get_warnings = True return self.database except Exception as error: - logger.error("Can't connect to MySQL: ", exec_info=error) + logger.exception("Can't connect to MySQL: ", error) os.sys.exit() def get_tableau_db_connection(self): @@ -102,7 +102,7 @@ def get_tableau_db_connection(self): self.database.get_warnings = True return self.database except Exception as error: - logger.error("Can't connect to MySQL: ", exc_info=error) + logger.exception("Can't connect to MySQL: ", error) os.sys.exit() def insert_data(self, sql, data): diff --git a/data_management/model/dlu_package.py b/data_management/model/dlu_package.py index 7c74e7b..649112a 100644 --- a/data_management/model/dlu_package.py +++ b/data_management/model/dlu_package.py @@ -19,6 +19,7 @@ def __init__(self): self.dlu_protocol = None self.dlu_data_generators = None self.dlu_files = [] + self.dlu_upload_type = None self.submitter_name = None self.known_specimen = None self.redcap_id = None @@ -61,6 +62,7 @@ def get_dmd_dpi_tuple(self): self.dlu_subject_id, self.dlu_error, self.dlu_lfu, + self.dlu_upload_type, self.globus_dlu_status ) diff --git a/data_management/process_bulk_uploads.py b/data_management/process_bulk_uploads.py index f449316..60b50cd 100644 --- a/data_management/process_bulk_uploads.py +++ b/data_management/process_bulk_uploads.py @@ -27,8 +27,8 @@ class ProcessBulkUploads: def __init__(self, data_directory: str, globus_only: bool = False, globus_root: str = None, preserve_path: bool = False, bypass_dup_check: bool = False): try: self.dlu_management = DluManagement() - except: - logger.error("There was a problem loading the Data Management library.") + except Exception as e: + logger.exception("There was a problem loading the Data Management library.", e) try: self.submitter = os.environ["mongo_submitter_id"] self.submitter_name = os.environ["submitter_name"] @@ -68,12 +68,12 @@ def process_files(self, manifest_files_arr: list) -> list: logger.info(file_full_path) size = os.path.getsize(file_full_path) file_info = self.dlu_file_handler.split_path(file_path, self.preserve_path) - if file["file_metadata"] and "md5_hash" in file["file_metadata"]: + if "file_metadata" in file and "md5_hash" in file["file_metadata"]: checksum = file["file_metadata"]["md5_hash"] del file["file_metadata"]["md5_hash"] else: checksum = calculate_checksum(file_full_path) - if file["file_metadata"]: + if "file_metadata" in file: metadata = file["file_metadata"] else: metadata = {} @@ -82,6 +82,7 @@ def process_files(self, manifest_files_arr: list) -> list: return dlu_files def process_bulk_uploads(self): + logger.info("in process bulk uploads") for manifest_name in MANIFEST_FILE_NAMES: manifest_file_path = os.path.join(self.data_directory, manifest_name) if os.path.isfile(manifest_file_path): @@ -93,13 +94,14 @@ def process_bulk_uploads(self): manifest_data = yaml.safe_load(stream) if manifest_data["package_type"] == "EM Images": package_type = PackageType.ELECTRON_MICROSCOPY - elif manifest_data["package_type"] == "Segmentation Masks": + elif manifest_data["package_type"] == "Segmentation Masks & Pathomics Vectors": package_type = PackageType.SEGMENTATION elif manifest_data["package_type"] == "Multimodal Images": package_type = PackageType.MULTI_MODAL elif manifest_data["package_type"] == "Single-cell RNA-Seq": package_type = PackageType.SINGLE_CELL else: + logger.info("package type is: ", manifest_data["package_type"]) package_type = PackageType.OTHER if "tis" in manifest_data: tis = manifest_data["tis"] @@ -150,6 +152,7 @@ def process_bulk_uploads(self): package.dlu_version = 4 package.dlu_dataset_information_version = 1 package.dlu_error = 0 + package.dlu_upload_type = 'KPMP Biopsy'; if self.globus_only: package.globus_dlu_status = None else: diff --git a/data_management/services/dlu_mongo.py b/data_management/services/dlu_mongo.py index 8830300..f1ef6e7 100644 --- a/data_management/services/dlu_mongo.py +++ b/data_management/services/dlu_mongo.py @@ -15,7 +15,7 @@ class PackageType(Enum): ELECTRON_MICROSCOPY = "Electron Microscopy Imaging" - SEGMENTATION = "Segmentation Masks" + SEGMENTATION = "Segmentation Masks & Pathomics Vectors" MULTI_MODAL = "Multimodal Imaging Mass Spectrometry" SINGLE_CELL = "Single-cell RNA-Seq" OTHER = "Other" From 8c31a1a4aac505a95ac63a09cc7f081c82bbe2de Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Mon, 22 Sep 2025 14:03:02 -0400 Subject: [PATCH 09/54] KPMP-5806: add new slide name --- data_management/model/slide_scan_model.py | 15 +++++ data_management/services/dlu_management.py | 31 ++++++--- data_management/services/slide_management.py | 66 ++++++++++++++++++++ data_management/watch_files.py | 9 +-- 4 files changed, 108 insertions(+), 13 deletions(-) create mode 100644 data_management/model/slide_scan_model.py create mode 100644 data_management/services/slide_management.py diff --git a/data_management/model/slide_scan_model.py b/data_management/model/slide_scan_model.py new file mode 100644 index 0000000..67d24d7 --- /dev/null +++ b/data_management/model/slide_scan_model.py @@ -0,0 +1,15 @@ +class SlideScanModel: + + def __init__(self, image_id: str, redcap_id: str, kit_id:str, new_file_name: str): + self.image_id = image_id + self.redcap_id = redcap_id + self.kit_id = kit_id + self.new_file_name = new_file_name + + def get_dmd_tuple(self): + return( + self.image_id, + self.kit_id, + self.redcap_id, + self.new_file_name + ) diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index 2a59102..aad06d1 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -189,9 +189,13 @@ def get_biopsy_tracking(self): def get_data_manager_data(self): result = self.db.get_data( """ - select dm.id, dm.dlu_package_id, dm.dlu_created, dm.dlu_submitter, dm.dlu_tis, dm.dlu_packageType, dm.dlu_subject_id, dm.dlu_error, dm.redcap_id, dm.known_specimen, dm.user_package_ready, dm.package_validated, dm.ready_to_move_from_globus, dm.globus_dlu_status, dm.package_status, dm.current_owner, dm.ar_promotion_status, dm.sv_promotion_status, dm.release_version, r.release_date, dm.removed_from_globus, dm.notes - from data_manager_data_v dm - left outer join `release` r on dm.release_version = r.release_version + SELECT dm.id, dm.dlu_package_id, dm.dlu_created, dm.dlu_submitter, dm.dlu_tis, dm.dlu_packageType, + dm.dlu_subject_id, dm.dlu_error, dm.redcap_id, dm.known_specimen, dm.user_package_ready, + dm.package_validated, dm.ready_to_move_from_globus, dm.globus_dlu_status, dm.package_status, + dm.current_owner, dm.ar_promotion_status, dm.sv_promotion_status, dm.release_version, r.release_date, + dm.removed_from_globus, dm.notes + FROM data_manager_data_v dm + LEFT OUTER JOIN `release` r on dm.release_version = r.release_version """ ) return result @@ -210,24 +214,37 @@ def delete_files_by_package_id(self, package_id: str): return self.db.get_data("DELETE FROM dlu_file WHERE dlu_package_id = %s", (package_id,)) def get_equal_num_rows(self): - result = self.db.get_data("SELECT (SELECT COUNT(*) FROM slide_manifest_import) = (SELECT COUNT(*) FROM slide_scan_curation) AS equal_num_rows") + result = self.db.get_data("SELECT (SELECT COUNT(*) FROM slide_manifest_import) = " + "(SELECT COUNT(*) FROM slide_scan_curation) AS equal_num_rows") return result[0]["equal_num_rows"] def get_new_slide_manifest_import_rows(self): - return self.db.get_data("SELECT * FROM slide_manifest_import WHERE image_id NOT IN (SELECT image_id FROM slide_scan_curation)") + return self.db.get_data("SELECT * FROM slide_manifest_import WHERE image_id NOT IN " + "(SELECT image_id FROM slide_scan_curation)") def get_spectrack_redcap_record_id(self, kit_id): - result = self.db.get_data("SELECT spectrack_redcap_record_id FROM spectrack_specimen WHERE spectrack_specimen_kit_id = %s LIMIT 1", (kit_id,)) + result = self.db.get_data("SELECT spectrack_redcap_record_id FROM spectrack_specimen " + "WHERE spectrack_specimen_kit_id = %s LIMIT 1", (kit_id,)) if len(result) > 0 and "spectrack_redcap_record_id" in result[0]: return result[0]["spectrack_redcap_record_id"] else: return None def insert_into_slide_scan_curation(self, values): - query = "INSERT INTO slide_scan_curation (image_id, kit_id, redcap_id) VALUES (%s, %s, %s)" + query = "INSERT INTO slide_scan_curation (image_id, kit_id, redcap_id, new_file_name) VALUES (%s, %s, %s, %s)" self.db.insert_data(query, values) return query % values + def get_slide_manifest_import_by_kit(self, kit_id, stain): + return self.db.get_data("SELECT * FROM slide_manifest_import WHERE outside_acc= %s AND stain = %s" + "AND block_id NOT 'OCT' ORDER BY stain, block_id", + (kit_id,stain,)) + + def set_error_message_slide_scan_curation(self, error, image_id): + self.db.insert_data("UPDATE slide_scan_curation set error_message = %s where image_id = %s", + (error, image_id,)) + + if __name__ == "__main__": dlu_management = DluManagement() dlu_management.get_data_management_tables() diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py new file mode 100644 index 0000000..0e06421 --- /dev/null +++ b/data_management/services/slide_management.py @@ -0,0 +1,66 @@ +from dlu_management import DluManagement +from ..model.slide_scan_model import SlideScanModel +import logging + +logger = logging.getLogger("services-dlu_package_watcher") +logger.setLevel(logging.INFO) + + +def determine_stain(stain_info, block_id): + if stain_info == "H&E": + if block_id != "OCT": + return "HE" + elif block_id == "OCT": + return "FRZ" + elif stain_info == "TRICHRM": + return "TRI" + elif stain_info == "PAS": + return "PAS" + elif stain_info == "Toluidine Blue": + return "TOL" + elif stain_info == "Jones Methenamine Silver (SIL)": + return "SIL" + return None + + +class SlideManagement: + def __init__(self, db: DluManagement = None): + if db: + self.db = db + else: + self.db = DluManagement() + + def process_slide_manifest_imports(self): + new_records = self.db.get_new_slide_manifest_import_rows() + for record in new_records: + kit_id = record["outside_acc"] + image_id = record["image_id"] + redcap_id = self.db.get_spectrack_redcap_record_id(kit_id) + new_file_name = self.determine_new_slide_name(record["barcode_id"], kit_id, record["stain"], + record["block_id"]) + slide_scan = SlideScanModel(image_id=image_id, redcap_id=redcap_id, kit_id=kit_id, + new_file_name=new_file_name) + self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) + + # If we were unable to determine the filename, we want to update this with an error message, after a + # record is in slide_scan_curation + if new_file_name is None: + self.db.set_error_message_slide_scan_curation("Unknown stain type", image_id=image_id) + + def determine_new_slide_name(self, sample_id: str, kit_id: str, stain_info: str, block_id: str): + slides_for_kit = self.db.get_slide_manifest_import_by_kit(kit_id, stain_info) + denominator = len(slides_for_kit) + numerator = 1 + + # Keep counting until we find this slide + for slide in slides_for_kit: + if slide['barcode_id'] != sample_id: + numerator = numerator + 1 + stain_type = determine_stain(stain_info, block_id) + + # If we are unable to determine the stain type, we will leave the new filename blank + if stain_type is None: + logger.info("Unable to determine stain type from stain: " + stain_info + " and block_id: " + block_id) + return None + else: + return sample_id + "_" + stain_type + "_" + str(numerator) + "of" + str(denominator) \ No newline at end of file diff --git a/data_management/watch_files.py b/data_management/watch_files.py index 9b9d866..c5ae0dd 100644 --- a/data_management/watch_files.py +++ b/data_management/watch_files.py @@ -5,6 +5,7 @@ from services.dlu_management import DluManagement from model.dlu_package import DLUPackage from services.dlu_mongo import DLUMongo +from services.slide_management import SlideManagement from dotenv import load_dotenv import logging @@ -28,6 +29,7 @@ def __init__ (self, db: DLUPackageInventory = None): self.dlu_file_handler = DLUFileHandler() self.dluPackage = DLUPackage() self.dlu_state = DLUState() + self.slide_management = SlideManagement(self.dlu_management) def watch_for_files(self): files = self.db.get_dlu_file("yes") @@ -48,12 +50,7 @@ def watch_for_side_manifest_records(self): def update_slide_scan_curation(self): logger.info("Importing new row(s) into slide_scan_curation") - new_records = self.dlu_management.get_new_slide_manifest_import_rows() - for record in new_records: - redcap_id = self.dlu_management.get_spectrack_redcap_record_id(record["outside_acc"]) - slide_scan_tuple = (record["image_id"], record["outside_acc"], redcap_id) - query_string = self.dlu_management.insert_into_slide_scan_curation(slide_scan_tuple) - logger.info(query_string) + self.slide_management.process_slide_manifest_imports() def update_files_for_globus(self, files): for index, file_result in enumerate(files): From 86e05791bba067dc5d8889817db514c14f750262 Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Mon, 22 Sep 2025 15:40:11 -0400 Subject: [PATCH 10/54] KPMP-5806: Get the slide names --- data_management/DluWatcher | 1 + data_management/model/__init__.py | 0 data_management/model/slide_scan_model.py | 15 ----- data_management/services/dlu_management.py | 4 +- data_management/services/slide_management.py | 68 ++++++++++++++------ 5 files changed, 53 insertions(+), 35 deletions(-) delete mode 100644 data_management/model/__init__.py delete mode 100644 data_management/model/slide_scan_model.py diff --git a/data_management/DluWatcher b/data_management/DluWatcher index fa8cada..4818e3f 100644 --- a/data_management/DluWatcher +++ b/data_management/DluWatcher @@ -10,6 +10,7 @@ COPY ./services/dlu_filesystem.py ./services/dlu_filesystem.py COPY ./services/dlu_package_inventory.py ./services/dlu_package_inventory.py COPY ./services/dlu_state.py ./services/dlu_state.py COPY ./services/dlu_management.py ./services/dlu_management.py +COPY ./services/slide_management.py ./services/slide_management.py COPY ./services/dlu_mongo.py ./services/dlu_mongo.py COPY ./model ./model COPY ./watch_files.py ./ diff --git a/data_management/model/__init__.py b/data_management/model/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/data_management/model/slide_scan_model.py b/data_management/model/slide_scan_model.py deleted file mode 100644 index 67d24d7..0000000 --- a/data_management/model/slide_scan_model.py +++ /dev/null @@ -1,15 +0,0 @@ -class SlideScanModel: - - def __init__(self, image_id: str, redcap_id: str, kit_id:str, new_file_name: str): - self.image_id = image_id - self.redcap_id = redcap_id - self.kit_id = kit_id - self.new_file_name = new_file_name - - def get_dmd_tuple(self): - return( - self.image_id, - self.kit_id, - self.redcap_id, - self.new_file_name - ) diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index aad06d1..0ba6e13 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -236,8 +236,8 @@ def insert_into_slide_scan_curation(self, values): return query % values def get_slide_manifest_import_by_kit(self, kit_id, stain): - return self.db.get_data("SELECT * FROM slide_manifest_import WHERE outside_acc= %s AND stain = %s" - "AND block_id NOT 'OCT' ORDER BY stain, block_id", + return self.db.get_data("SELECT * FROM slide_manifest_import WHERE outside_acc= %s AND stain = %s " + "ORDER BY stain, block_id", (kit_id,stain,)) def set_error_message_slide_scan_curation(self, error, image_id): diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index 0e06421..dec541a 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -1,11 +1,26 @@ -from dlu_management import DluManagement -from ..model.slide_scan_model import SlideScanModel import logging logger = logging.getLogger("services-dlu_package_watcher") logger.setLevel(logging.INFO) +class SlideScanModel: + + def __init__(self, image_id: str, redcap_id: str, kit_id: str, new_file_name: str): + self.image_id = image_id + self.redcap_id = redcap_id + self.kit_id = kit_id + self.new_file_name = new_file_name + + def get_dmd_tuple(self): + return ( + self.image_id, + self.kit_id, + self.redcap_id, + self.new_file_name + ) + + def determine_stain(stain_info, block_id): if stain_info == "H&E": if block_id != "OCT": @@ -22,13 +37,33 @@ def determine_stain(stain_info, block_id): return "SIL" return None +def calculate_denominator(slides_for_kit, block_id): + denominator = 0 + for slide in slides_for_kit: + if block_id != 'OCT' and slide['block_id'] != 'OCT': + denominator = denominator + 1 + elif block_id == 'OCT' and slide['block_id'] == 'OCT': + denominator = denominator + 1 + return denominator -class SlideManagement: - def __init__(self, db: DluManagement = None): - if db: - self.db = db + +def calculate_numerator(block_id, sample_id, slides_for_kit): + numerator = 1 + # Keep counting until we find this slide + for slide in slides_for_kit: + if slide['barcode_id'] != sample_id: + if block_id != 'OCT' and slide['block_id'] != 'OCT': + numerator = numerator + 1 + elif block_id == 'OCT' and slide['block_id'] == 'OCT': + numerator = numerator + 1 else: - self.db = DluManagement() + break + return numerator + + +class SlideManagement: + def __init__(self, db): + self.db = db def process_slide_manifest_imports(self): new_records = self.db.get_new_slide_manifest_import_rows() @@ -36,8 +71,8 @@ def process_slide_manifest_imports(self): kit_id = record["outside_acc"] image_id = record["image_id"] redcap_id = self.db.get_spectrack_redcap_record_id(kit_id) - new_file_name = self.determine_new_slide_name(record["barcode_id"], kit_id, record["stain"], - record["block_id"]) + new_file_name = self.determine_new_slide_name(sample_id=record["barcode_id"], kit_id=kit_id, + stain_info=record["stain"], block_id=record["block_id"]) slide_scan = SlideScanModel(image_id=image_id, redcap_id=redcap_id, kit_id=kit_id, new_file_name=new_file_name) self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) @@ -49,18 +84,15 @@ def process_slide_manifest_imports(self): def determine_new_slide_name(self, sample_id: str, kit_id: str, stain_info: str, block_id: str): slides_for_kit = self.db.get_slide_manifest_import_by_kit(kit_id, stain_info) - denominator = len(slides_for_kit) - numerator = 1 - - # Keep counting until we find this slide - for slide in slides_for_kit: - if slide['barcode_id'] != sample_id: - numerator = numerator + 1 - stain_type = determine_stain(stain_info, block_id) + denominator = calculate_denominator(slides_for_kit, block_id) + + numerator = calculate_numerator(block_id, sample_id, slides_for_kit) + + stain_type = determine_stain(stain_info, block_id) # If we are unable to determine the stain type, we will leave the new filename blank if stain_type is None: logger.info("Unable to determine stain type from stain: " + stain_info + " and block_id: " + block_id) return None else: - return sample_id + "_" + stain_type + "_" + str(numerator) + "of" + str(denominator) \ No newline at end of file + return sample_id + "_" + stain_type + "_" + str(numerator) + "of" + str(denominator) + ".svs" From 66ad3c41180496702dc9b93734ae843289253ba5 Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Thu, 25 Sep 2025 14:22:00 -0400 Subject: [PATCH 11/54] KPMP-6197: Updated the scripts to work with changes to the app --- data_management/.BulkUploader.swp | Bin 0 -> 12288 bytes data_management/process_bulk_uploads.py | 33 ++++++++++++++++++--- data_management/services/dlu_filesystem.py | 7 ++++- data_management/services/dlu_state.py | 2 +- 4 files changed, 36 insertions(+), 6 deletions(-) create mode 100644 data_management/.BulkUploader.swp diff --git a/data_management/.BulkUploader.swp b/data_management/.BulkUploader.swp new file mode 100644 index 0000000000000000000000000000000000000000..05aea8a03e880f949ba529254144a8355d937d64 GIT binary patch literal 12288 zcmeI2KabNe6u^B9YzGJi#P$Hu2`4Fq0i+uaxD(~Lw(7Nqjz6m0)J>y0iLsp%U0{K4 zzyd29+X?XrV1f<40EjQbOWHzp;?Rwp^c?--*w1;ff14HegYEhzT=%LB%PGdb?R_1+ zyu5c@>Lq5Bks_HCk$)ED5)lT~z*AW$)WJia%@X=i?EJA#w6Eg65quz=@QYh75vhn| zB2(wz%+lEtAOmE843GgbKnBPF86X2>fDDjN(_y0dmGWH$$f_z3kAa9V@$O}Xw17riai=0J%VvbM9N8~;77I}p@ zmdTtluR#rG~DRLB4sAOmE843GgbKnBPF86X2>;D0i(*66f%TFrW+0}mfFcB|Ri z0Z*p8;Q2*0?MSkLR6cf<57Ry?&2Ah@TXfH#rx9&3?6p7ZYI*-h*Z`W%ES3brd)t9kPU}3EBiAQ>tTSUQUReIk1)gU zX-*)_jVeDjkL4N*ji&7*6*!uZS-(`y*tgV&Pw2w*FtEu)moXFV(fmiy>YwV l20ypPw=zH1&gTFC literal 0 HcmV?d00001 diff --git a/data_management/process_bulk_uploads.py b/data_management/process_bulk_uploads.py index 60b50cd..5bc1459 100644 --- a/data_management/process_bulk_uploads.py +++ b/data_management/process_bulk_uploads.py @@ -81,6 +81,27 @@ def process_files(self, manifest_files_arr: list) -> list: dlu_files.append(dlu_file) return dlu_files + def process_globus_only_files(self, manifest_files_arr: list) -> list: + logger.info("globus only file processing") + files = [] + for file in manifest_files_arr: + file_path = file["relative_file_path_and_name"] + file_full_path = os.path.join(self.data_directory, file_path) + file_info = self.dlu_file_handler.split_path(file_path, self.preserve_path) + if "file_metadata" in file and "md5_hash" in file["file_metadata"]: + checksum = file["file_metadata"]["md5_hash"] + del file["file_metadata"]["md5_hash"] + if "file_metadata" in file: + metadata = file["file_metadata"] + else: + metadata = {} + + # Since this is going directly to globus, we don't need to calc checksum or filesize, and we need + # the path to the file on disk to actually copy it + dlu_file = DLUFile(file_info["file_name"], file_full_path, '', 0, metadata) + files.append(dlu_file) + return files + def process_bulk_uploads(self): logger.info("in process bulk uploads") for manifest_name in MANIFEST_FILE_NAMES: @@ -113,12 +134,13 @@ def process_bulk_uploads(self): redcap_id = experiment["files"][0]["redcap_id"] sample_id = experiment["files"][0]["spectrack_sample_id"] if redcap_id and redcap_id.startswith("S-"): + logger.info("found redcap id starting with S-") sample_id = redcap_id redcap_results = self.dlu_management.get_redcapid_by_subjectid(sample_id) - if redcap_results is not None and len(redcap_results) == 1: + if redcap_results is not None and len(redcap_results) > 1: redcap_id = redcap_results else: - redcap_id = "" + redcap_id = None if not sample_id: sample_id = redcap_id @@ -126,7 +148,10 @@ def process_bulk_uploads(self): if (sample_id and len(self.dlu_management.get_participant_by_redcap_id(redcap_id)) > 0) or \ (self.globus_only and sample_id): logger.info(f"Trying to add package for {redcap_id} / {sample_id}") - dlu_file_list = self.process_files(experiment["files"]) + if self.globus_only: + dlu_file_list = self.process_globus_only_files(experiment["files"]) + else: + dlu_file_list = self.process_files(experiment["files"]) if package_type == PackageType.SEGMENTATION: dlu_file_list.append(self.get_single_file(SEGMENTATION_README)) tis = "UFL" @@ -152,7 +177,7 @@ def process_bulk_uploads(self): package.dlu_version = 4 package.dlu_dataset_information_version = 1 package.dlu_error = 0 - package.dlu_upload_type = 'KPMP Biopsy'; + package.dlu_upload_type = 'KPMP Biopsy' if self.globus_only: package.globus_dlu_status = None else: diff --git a/data_management/services/dlu_filesystem.py b/data_management/services/dlu_filesystem.py index 7cf69d7..1285dcf 100644 --- a/data_management/services/dlu_filesystem.py +++ b/data_management/services/dlu_filesystem.py @@ -121,8 +121,10 @@ def copy_files(self, package_id: str, file_list: list[DLUFile], preserve_path: b if os.path.exists(dest_package_directory): shutil.rmtree(dest_package_directory) for file in file_list: + source_package_directory = self.globus_data_directory + '/' + self.globus_dir_prefix # I.e. isn't a bulk upload that doesn't already have a package ID. + logger.info(source_package_directory) if not no_src_package: source_package_directory = source_package_directory + package_id if file.path and os.path.isdir(file.path): @@ -136,7 +138,6 @@ def copy_files(self, package_id: str, file_list: list[DLUFile], preserve_path: b if os.path.isdir(os.path.join(source_package_directory, o))] dir = "".join(subdirs) if len(os.listdir(source_package_directory)) == 1 and os.path.isdir(source_package_directory) and os.path.isdir(dir): - os.chdir(dir) allfiles = os.listdir(dir) for f in allfiles: @@ -167,6 +168,10 @@ def copy_files(self, package_id: str, file_list: list[DLUFile], preserve_path: b elif os.path.isfile(source_file): logger.info("Copying file to " + dest_file) shutil.copy(source_file, dest_file) + else: + source_file = os.path.join(source_package_directory, file.path) + logger.info("Copying file to " + dest_file) + shutil.copy(source_file, dest_file) files_copied = files_copied + 1 else: logger.warning(dest_file + " already exists. Skipping.") diff --git a/data_management/services/dlu_state.py b/data_management/services/dlu_state.py index 26edbcd..af92949 100644 --- a/data_management/services/dlu_state.py +++ b/data_management/services/dlu_state.py @@ -41,7 +41,7 @@ def set_package_state(self, package_id: str, state: PackageState, codicil = None if e and e.strerror: logger.error("There was an error updating the state: " + e.strerror) else: - logger.error("There was an error updating the state.") + logger.exception("There was a problem updating state", e) def clear_cache(self): requests.get(self.cache_clear_url) From b267ecdc30f2a323ca948268a440d53d54d2656c Mon Sep 17 00:00:00 2001 From: zwright Date: Thu, 25 Sep 2025 15:37:09 -0400 Subject: [PATCH 12/54] KPMP-6223: new cols and rename --- data_management/services/dlu_management.py | 2 +- data_management/services/tableau.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index 0ba6e13..4a8bdd6 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -191,7 +191,7 @@ def get_data_manager_data(self): """ SELECT dm.id, dm.dlu_package_id, dm.dlu_created, dm.dlu_submitter, dm.dlu_tis, dm.dlu_packageType, dm.dlu_subject_id, dm.dlu_error, dm.redcap_id, dm.known_specimen, dm.user_package_ready, - dm.package_validated, dm.ready_to_move_from_globus, dm.globus_dlu_status, dm.package_status, + dm.package_validated, dm.ready_to_move_from_globus, dm.globus_dlu_status, dm.dlu_upload_type, dm.upload_type_detail, dm.atlas_status, dm.current_owner, dm.ar_promotion_status, dm.sv_promotion_status, dm.release_version, r.release_date, dm.removed_from_globus, dm.notes FROM data_manager_data_v dm diff --git a/data_management/services/tableau.py b/data_management/services/tableau.py index bfa23f0..4e48cf3 100644 --- a/data_management/services/tableau.py +++ b/data_management/services/tableau.py @@ -34,8 +34,8 @@ def load_biopsy_tracking(self): def load_data_manager_data(self): self.truncate_data_manager_data() results = self.dlu_management.get_data_manager_data() - query = "INSERT INTO kpmp_dvc_integration.data_manager_data(id, dlu_package_id, dlu_created, dlu_submitter, dlu_tis, dlu_packageType, dlu_subject_id, dlu_error, redcap_id, known_specimen, user_package_ready, package_validated, ready_to_move_from_globus, globus_dlu_status, package_status, current_owner, ar_promotion_status, sv_promotion_status, release_version, release_date, removed_from_globus, notes) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" - records_modified = 0 + query = "INSERT INTO kpmp_dvc_integration.data_manager_data(id, dlu_package_id, dlu_created, dlu_submitter, dlu_tis, dlu_packageType, dlu_subject_id, dlu_error, redcap_id, known_specimen, user_package_ready, package_validated, ready_to_move_from_globus, globus_dlu_status, upload_type, upload_type_detail, atlas_status, current_owner, ar_promotion_status, sv_promotion_status, release_version, release_date, removed_from_globus, notes) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" + records_modified = 0%s, for result in results: result["dlu_created"] = result["dlu_created"].strftime('%Y-%m-%d %H:%M:%S') insert_result = self.db_tableau.insert_data(query, tuple(result.values())) From b03b3d1695b80d84e00b01aab3e574dc3eed058d Mon Sep 17 00:00:00 2001 From: zwright Date: Thu, 25 Sep 2025 15:55:13 -0400 Subject: [PATCH 13/54] KPMP-6223: typo --- data_management/services/tableau.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_management/services/tableau.py b/data_management/services/tableau.py index 4e48cf3..40ffa6d 100644 --- a/data_management/services/tableau.py +++ b/data_management/services/tableau.py @@ -35,7 +35,7 @@ def load_data_manager_data(self): self.truncate_data_manager_data() results = self.dlu_management.get_data_manager_data() query = "INSERT INTO kpmp_dvc_integration.data_manager_data(id, dlu_package_id, dlu_created, dlu_submitter, dlu_tis, dlu_packageType, dlu_subject_id, dlu_error, redcap_id, known_specimen, user_package_ready, package_validated, ready_to_move_from_globus, globus_dlu_status, upload_type, upload_type_detail, atlas_status, current_owner, ar_promotion_status, sv_promotion_status, release_version, release_date, removed_from_globus, notes) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" - records_modified = 0%s, + records_modified = 0, for result in results: result["dlu_created"] = result["dlu_created"].strftime('%Y-%m-%d %H:%M:%S') insert_result = self.db_tableau.insert_data(query, tuple(result.values())) From c8649b54b280f66ea41dbadcd816d955b87e7cae Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Fri, 26 Sep 2025 11:09:44 -0400 Subject: [PATCH 14/54] KPMP-6195: calculate the filename and foldername --- data_management/services/dlu_management.py | 3 ++- data_management/services/slide_management.py | 17 ++++++++++++++--- .../services/tests/test_slide_management.py | 19 +++++++++++++++++++ 3 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 data_management/services/tests/test_slide_management.py diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index 0ba6e13..b703f4a 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -231,7 +231,8 @@ def get_spectrack_redcap_record_id(self, kit_id): return None def insert_into_slide_scan_curation(self, values): - query = "INSERT INTO slide_scan_curation (image_id, kit_id, redcap_id, new_file_name) VALUES (%s, %s, %s, %s)" + query = "INSERT INTO slide_scan_curation (image_id, kit_id, redcap_id, new_file_name, source_file_name, " \ + "source_folder_name) VALUES (%s, %s, %s, %s, %s, %s)" self.db.insert_data(query, values) return query % values diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index dec541a..e1e8e96 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -1,4 +1,5 @@ import logging +from pathlib import PureWindowsPath logger = logging.getLogger("services-dlu_package_watcher") logger.setLevel(logging.INFO) @@ -6,18 +7,23 @@ class SlideScanModel: - def __init__(self, image_id: str, redcap_id: str, kit_id: str, new_file_name: str): + def __init__(self, image_id: str, redcap_id: str, kit_id: str, new_file_name: str, source_file_name: str, + source_folder_name: str): self.image_id = image_id self.redcap_id = redcap_id self.kit_id = kit_id self.new_file_name = new_file_name + self.source_file_name = source_file_name + self.source_folder_name = source_folder_name def get_dmd_tuple(self): return ( self.image_id, self.kit_id, self.redcap_id, - self.new_file_name + self.new_file_name, + self.source_file_name, + self.source_folder_name ) @@ -73,8 +79,13 @@ def process_slide_manifest_imports(self): redcap_id = self.db.get_spectrack_redcap_record_id(kit_id) new_file_name = self.determine_new_slide_name(sample_id=record["barcode_id"], kit_id=kit_id, stain_info=record["stain"], block_id=record["block_id"]) + file_location = PureWindowsPath(record['file_location']) + source_file_name = file_location.name + source_folder_name = file_location.parent.name + slide_scan = SlideScanModel(image_id=image_id, redcap_id=redcap_id, kit_id=kit_id, - new_file_name=new_file_name) + new_file_name=new_file_name, source_file_name=source_file_name, + source_folder_name=source_folder_name) self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) # If we were unable to determine the filename, we want to update this with an error message, after a diff --git a/data_management/services/tests/test_slide_management.py b/data_management/services/tests/test_slide_management.py new file mode 100644 index 0000000..0983b3a --- /dev/null +++ b/data_management/services/tests/test_slide_management.py @@ -0,0 +1,19 @@ + +import os +from pathlib import PureWindowsPath +if __name__ == "__main__": + + + + file_location = PureWindowsPath(r'\\corefs2.med.umich.edu\shared4\path-aperio\prod\images\Hodgin\KPMP\KPMP slides 20250730 47377\S-2412-001898_082919.svs') + + filename_with_extension = file_location.name + print(filename_with_extension) + parent_folder = file_location.parent.name + print(parent_folder) + # source_file_name = os.path.basename(file_location) + # source_folder_name = os.path.dirname(file_location) + # print(source_folder_name) + # print("hi") + # print(source_file_name) + # print("bye") From 1f58f5892c726fffc5d9dc54ab71b5e89ffb53f7 Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Fri, 26 Sep 2025 13:29:47 -0400 Subject: [PATCH 15/54] KPMP-6195: handle issues with file_location and file name determination --- data_management/services/slide_management.py | 36 ++++++++++++++------ 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index e1e8e96..b1e7f04 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -43,6 +43,7 @@ def determine_stain(stain_info, block_id): return "SIL" return None + def calculate_denominator(slides_for_kit, block_id): denominator = 0 for slide in slides_for_kit: @@ -74,24 +75,37 @@ def __init__(self, db): def process_slide_manifest_imports(self): new_records = self.db.get_new_slide_manifest_import_rows() for record in new_records: + record_in_error = False + error_message = "" kit_id = record["outside_acc"] image_id = record["image_id"] redcap_id = self.db.get_spectrack_redcap_record_id(kit_id) - new_file_name = self.determine_new_slide_name(sample_id=record["barcode_id"], kit_id=kit_id, - stain_info=record["stain"], block_id=record["block_id"]) - file_location = PureWindowsPath(record['file_location']) - source_file_name = file_location.name - source_folder_name = file_location.parent.name - + if record["barcode_id"] is not None: + new_file_name = self.determine_new_slide_name(sample_id=record["barcode_id"], kit_id=kit_id, + stain_info=record["stain"], block_id=record["block_id"]) + else: + new_file_name = None + record_in_error = True + error_message = "Missing sample id, unable to determine file name." + + # Sometimes the file location gets copied in with a single leading slash + if record["file_location"].count("\\") < 2: + source_file_name = None + source_folder_name = None + record_in_error = True + error_message = error_message + "Unable to determine source file or folder." + else: + file_location = PureWindowsPath(record['file_location']) + source_file_name = file_location.name + source_folder_name = file_location.parent.name + slide_scan = SlideScanModel(image_id=image_id, redcap_id=redcap_id, kit_id=kit_id, new_file_name=new_file_name, source_file_name=source_file_name, source_folder_name=source_folder_name) - self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) - # If we were unable to determine the filename, we want to update this with an error message, after a - # record is in slide_scan_curation - if new_file_name is None: - self.db.set_error_message_slide_scan_curation("Unknown stain type", image_id=image_id) + self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) + if record_in_error: + self.db.set_error_message_slide_scan_curation(image_id=image_id, error=error_message) def determine_new_slide_name(self, sample_id: str, kit_id: str, stain_info: str, block_id: str): slides_for_kit = self.db.get_slide_manifest_import_by_kit(kit_id, stain_info) From 44e81e52b79f6cb203469b64b3e829ad2ac4b3ef Mon Sep 17 00:00:00 2001 From: Zach Wright Date: Fri, 26 Sep 2025 13:37:10 -0400 Subject: [PATCH 16/54] KPMP-6223: fix args and extra comma --- data_management/services/tableau.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/data_management/services/tableau.py b/data_management/services/tableau.py index 40ffa6d..ec05b17 100644 --- a/data_management/services/tableau.py +++ b/data_management/services/tableau.py @@ -34,8 +34,9 @@ def load_biopsy_tracking(self): def load_data_manager_data(self): self.truncate_data_manager_data() results = self.dlu_management.get_data_manager_data() - query = "INSERT INTO kpmp_dvc_integration.data_manager_data(id, dlu_package_id, dlu_created, dlu_submitter, dlu_tis, dlu_packageType, dlu_subject_id, dlu_error, redcap_id, known_specimen, user_package_ready, package_validated, ready_to_move_from_globus, globus_dlu_status, upload_type, upload_type_detail, atlas_status, current_owner, ar_promotion_status, sv_promotion_status, release_version, release_date, removed_from_globus, notes) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" - records_modified = 0, + print(len(results)) + query = "INSERT INTO kpmp_dvc_integration.data_manager_data(id, dlu_package_id, dlu_created, dlu_submitter, dlu_tis, dlu_packageType, dlu_subject_id, dlu_error, redcap_id, known_specimen, user_package_ready, package_validated, ready_to_move_from_globus, globus_dlu_status, upload_type, upload_type_detail, atlas_status, current_owner, ar_promotion_status, sv_promotion_status, release_version, release_date, removed_from_globus, notes) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" + records_modified = 0 for result in results: result["dlu_created"] = result["dlu_created"].strftime('%Y-%m-%d %H:%M:%S') insert_result = self.db_tableau.insert_data(query, tuple(result.values())) From 6b6c1a1f3f289da614f84eea8aa2189abd9777a9 Mon Sep 17 00:00:00 2001 From: HaneenT Date: Mon, 29 Sep 2025 11:21:13 -0400 Subject: [PATCH 17/54] KPMP-6233: fix log errors --- data_management/services/redcap.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_management/services/redcap.py b/data_management/services/redcap.py index a36183a..cbf62f9 100644 --- a/data_management/services/redcap.py +++ b/data_management/services/redcap.py @@ -82,7 +82,7 @@ def parse_redcap_records_by_participant(self, redcap_id, field_name): return "Intra-operative Needle Biopsy" else: logger.error( - f'Error: unknown value for record: {record["record"]} with field_name: {record["field_name"]} value: {record["value"]}' + f'Error: unknown value for record with field_name: {record["field_name"]} value: {record["value"]}' ) os.sys.exit() logger.debug("End: parse_redcap_records_by_participant") @@ -195,7 +195,7 @@ def parse_participant_records(self): else: logger.error( - f'Error: Additional fields found we are not mapping: {record["record"]} with field_name: {record["field_name"]} value: {record["value"]}' + f'Error: Additional fields found we are not mapping: record with field_name: {record["field_name"]} value: {record["value"]}' ) participant["redcap_tissue_source"] = "KPMP Recruitment Site" # hard-coded value provided by Jonas From 5290bd957dbb985dae0438ef5f9b2471efa3cdea Mon Sep 17 00:00:00 2001 From: HaneenT Date: Mon, 29 Sep 2025 11:23:51 -0400 Subject: [PATCH 18/54] "value" keyError --- data_management/services/redcap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_management/services/redcap.py b/data_management/services/redcap.py index cbf62f9..8cee072 100644 --- a/data_management/services/redcap.py +++ b/data_management/services/redcap.py @@ -195,7 +195,7 @@ def parse_participant_records(self): else: logger.error( - f'Error: Additional fields found we are not mapping: record with field_name: {record["field_name"]} value: {record["value"]}' + f'Error: Additional fields found we are not mapping: record with field_name: {record["field_name"]}' ) participant["redcap_tissue_source"] = "KPMP Recruitment Site" # hard-coded value provided by Jonas From 84f82436e33fc2c8d98dba0264e748fc8d5cab1b Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Wed, 1 Oct 2025 12:32:41 -0400 Subject: [PATCH 19/54] KPMP-6195: change col for sample id --- data_management/services/slide_management.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index b1e7f04..de189c1 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -58,7 +58,7 @@ def calculate_numerator(block_id, sample_id, slides_for_kit): numerator = 1 # Keep counting until we find this slide for slide in slides_for_kit: - if slide['barcode_id'] != sample_id: + if slide['accession'] != sample_id: if block_id != 'OCT' and slide['block_id'] != 'OCT': numerator = numerator + 1 elif block_id == 'OCT' and slide['block_id'] == 'OCT': @@ -80,8 +80,8 @@ def process_slide_manifest_imports(self): kit_id = record["outside_acc"] image_id = record["image_id"] redcap_id = self.db.get_spectrack_redcap_record_id(kit_id) - if record["barcode_id"] is not None: - new_file_name = self.determine_new_slide_name(sample_id=record["barcode_id"], kit_id=kit_id, + if record["accession"] is not None: + new_file_name = self.determine_new_slide_name(sample_id=record["accession"], kit_id=kit_id, stain_info=record["stain"], block_id=record["block_id"]) else: new_file_name = None From fba4ed4f2d828699cc084c8fbf3caf47e47771e3 Mon Sep 17 00:00:00 2001 From: rlreamy <34109594+rlreamy@users.noreply.github.com> Date: Thu, 2 Oct 2025 14:22:24 -0400 Subject: [PATCH 20/54] Delete data_management/services/tests/test_slide_management.py deleted dummy test file used to figure out how to get filename --- .../services/tests/test_slide_management.py | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 data_management/services/tests/test_slide_management.py diff --git a/data_management/services/tests/test_slide_management.py b/data_management/services/tests/test_slide_management.py deleted file mode 100644 index 0983b3a..0000000 --- a/data_management/services/tests/test_slide_management.py +++ /dev/null @@ -1,19 +0,0 @@ - -import os -from pathlib import PureWindowsPath -if __name__ == "__main__": - - - - file_location = PureWindowsPath(r'\\corefs2.med.umich.edu\shared4\path-aperio\prod\images\Hodgin\KPMP\KPMP slides 20250730 47377\S-2412-001898_082919.svs') - - filename_with_extension = file_location.name - print(filename_with_extension) - parent_folder = file_location.parent.name - print(parent_folder) - # source_file_name = os.path.basename(file_location) - # source_folder_name = os.path.dirname(file_location) - # print(source_folder_name) - # print("hi") - # print(source_file_name) - # print("bye") From 6071ec66d318d76c2689d0b6c97ea03423cf1b8b Mon Sep 17 00:00:00 2001 From: Zach Wright Date: Mon, 6 Oct 2025 14:49:49 -0400 Subject: [PATCH 21/54] Update data_manager_data.sql --- data_management/sql/data_manager_data.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_management/sql/data_manager_data.sql b/data_management/sql/data_manager_data.sql index 7778095..7c25149 100644 --- a/data_management/sql/data_manager_data.sql +++ b/data_management/sql/data_manager_data.sql @@ -24,4 +24,4 @@ CREATE TABLE `data_manager_data` ( `notes` text DEFAULT NULL ) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci; -ALTER TABLE data_manager_data ADD release_date DATETIME AFTER release_version; \ No newline at end of file +ALTER TABLE data_manager_data ADD release_date DATETIME AFTER release_version; From a3744c072e46dc51abf2321b8f05b7a9b0829d39 Mon Sep 17 00:00:00 2001 From: Zach Wright Date: Mon, 6 Oct 2025 14:52:03 -0400 Subject: [PATCH 22/54] Update data_manager_data.sql --- data_management/sql/data_manager_data.sql | 47 ++++++++++++----------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/data_management/sql/data_manager_data.sql b/data_management/sql/data_manager_data.sql index 7c25149..12504dc 100644 --- a/data_management/sql/data_manager_data.sql +++ b/data_management/sql/data_manager_data.sql @@ -1,27 +1,28 @@ -- kpmp_dvc_integration.data_manager_data definition CREATE TABLE `data_manager_data` ( - `id` int(11) DEFAULT 0, - `dlu_package_id` varchar(100) DEFAULT NULL, - `dlu_created` datetime NOT NULL, - `dlu_submitter` varchar(100) DEFAULT NULL, - `dlu_tis` varchar(100) DEFAULT NULL, - `dlu_packageType` varchar(100) DEFAULT NULL, - `dlu_subject_id` varchar(200) DEFAULT NULL, - `dlu_error` tinyint(1) DEFAULT 0, - `redcap_id` text DEFAULT NULL, - `known_specimen` text DEFAULT NULL, - `user_package_ready` char(1) DEFAULT 'N', - `package_validated` text DEFAULT NULL, - `ready_to_move_from_globus` varchar(100) DEFAULT NULL, - `globus_dlu_status` varchar(255) DEFAULT NULL, - `package_status` text DEFAULT NULL, - `current_owner` varchar(100) DEFAULT NULL, - `ar_promotion_status` varchar(100) DEFAULT NULL, - `sv_promotion_status` varchar(100) DEFAULT NULL, - `release_version` varchar(100) DEFAULT NULL, - `removed_from_globus` varchar(100) DEFAULT NULL, - `notes` text DEFAULT NULL + `id` int(11) DEFAULT 0, + `dlu_package_id` varchar(100) DEFAULT NULL, + `dlu_created` datetime NOT NULL, + `dlu_submitter` varchar(100) DEFAULT NULL, + `dlu_tis` varchar(100) DEFAULT NULL, + `dlu_packageType` varchar(100) DEFAULT NULL, + `dlu_subject_id` varchar(200) DEFAULT NULL, + `dlu_error` tinyint(1) DEFAULT 0, + `redcap_id` text DEFAULT NULL, + `known_specimen` text DEFAULT NULL, + `user_package_ready` char(1) DEFAULT 'N', + `package_validated` text DEFAULT NULL, + `ready_to_move_from_globus` varchar(100) DEFAULT NULL, + `globus_dlu_status` varchar(255) DEFAULT NULL, + `upload_type` varchar(100) DEFAULT NULL, + `upload_type_detail` text DEFAULT NULL, + `atlas_status` text DEFAULT NULL, + `current_owner` varchar(100) DEFAULT NULL, + `ar_promotion_status` varchar(100) DEFAULT NULL, + `sv_promotion_status` varchar(100) DEFAULT NULL, + `release_version` varchar(100) DEFAULT NULL, + `release_date` datetime DEFAULT NULL, + `removed_from_globus` varchar(100) DEFAULT NULL, + `notes` text DEFAULT NULL ) ENGINE=InnoDB DEFAULT CHARSET=latin1 COLLATE=latin1_swedish_ci; - -ALTER TABLE data_manager_data ADD release_date DATETIME AFTER release_version; From 18807e41cfdc5ed2c7e054e7e689bd82e34f283c Mon Sep 17 00:00:00 2001 From: dert1129 Date: Wed, 8 Oct 2025 09:40:27 -0400 Subject: [PATCH 23/54] alides found missing slides table are marked as error --- data_management/lib/mysql_connection.py | 14 ++++++++++++++ data_management/services/dlu_management.py | 16 +++++++++++++++- data_management/services/slide_management.py | 20 ++++++++++++++++---- data_management/watch_files.py | 4 ++-- 4 files changed, 47 insertions(+), 7 deletions(-) diff --git a/data_management/lib/mysql_connection.py b/data_management/lib/mysql_connection.py index f3d67f7..36678aa 100644 --- a/data_management/lib/mysql_connection.py +++ b/data_management/lib/mysql_connection.py @@ -123,6 +123,20 @@ def insert_data(self, sql, data): finally: self.database.commit() self.cursor.close() + + def insert_data_no_alert(self, sql, data): + try: + self.get_db_cursor() + self.cursor.execute(sql, data) + warning = self.cursor.fetchwarnings() + if warning is not None: + print(warning) + except: + message = f"Error: Cannot insert with query: {sql}; and the data: {data}" + logger.error(message) + finally: + self.database.commit() + self.cursor.close() def get_data(self, sql, query_data=None): try: diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index 5b08ee4..0ea5bf9 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -101,6 +101,16 @@ def update_dlu_package(self, package_id: str, fields_values: dict): values = query_info["values"][0:] + (package_id,) query = "UPDATE data_manager_data_v SET " + query_info["set_clause"] + " WHERE dlu_package_id = %s" self.db.insert_data(query, values) + + def get_missing_slides(self, redcap_id: str): + return self.db.get_data( + "select * from missing_slides_v where spectrack_redcap_record_id = %", + redcap_id,), + + def update_missing_slides(self, redcap_id: str): + return self.db.get_data( + "update slide_scan_curation set missing_slides = 1 where redcap_id = %s", redcap_id, + ), def insert_dlu_file(self, values): query = "INSERT INTO dlu_file (dlu_fileName, dlu_package_id, dlu_file_id, dlu_filesize, dlu_md5checksum, dlu_modified_at, dlu_metadata) VALUES(%s, %s, %s, %s, %s, %s, %s)" @@ -220,7 +230,7 @@ def get_equal_num_rows(self): def get_new_slide_manifest_import_rows(self): return self.db.get_data("SELECT * FROM slide_manifest_import WHERE image_id NOT IN " - "(SELECT image_id FROM slide_scan_curation)") + "(SELECT image_id FROM slide_scan_curation where missing_slides = 0)") def get_spectrack_redcap_record_id(self, kit_id): result = self.db.get_data("SELECT spectrack_redcap_record_id FROM spectrack_specimen " @@ -244,6 +254,10 @@ def get_slide_manifest_import_by_kit(self, kit_id, stain): def set_error_message_slide_scan_curation(self, error, image_id): self.db.insert_data("UPDATE slide_scan_curation set error_message = %s where image_id = %s", (error, image_id,)) + + def set_error_message_slide_scan_curation_redcap_id(self, error, redcap_id): + self.db.insert_data_no_alert("UPDATE slide_scan_curation set error_message = %s where redcap_id = %s", + (error, redcap_id,)) if __name__ == "__main__": diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index de189c1..ad21b64 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -86,24 +86,36 @@ def process_slide_manifest_imports(self): else: new_file_name = None record_in_error = True - error_message = "Missing sample id, unable to determine file name." + error_message = "Missing sample id, unable to determine file name; " # Sometimes the file location gets copied in with a single leading slash if record["file_location"].count("\\") < 2: source_file_name = None source_folder_name = None record_in_error = True - error_message = error_message + "Unable to determine source file or folder." + error_message = error_message + "Unable to determine source file or folder; " else: file_location = PureWindowsPath(record['file_location']) source_file_name = file_location.name source_folder_name = file_location.parent.name + + check_missing_slides = self.db.get_missing_slides(redcap_id) + if len(check_missing_slides) >= 1: - slide_scan = SlideScanModel(image_id=image_id, redcap_id=redcap_id, kit_id=kit_id, + slide_scan = SlideScanModel(image_id=image_id, redcap_id=redcap_id, kit_id=kit_id, new_file_name=new_file_name, source_file_name=source_file_name, source_folder_name=source_folder_name) - self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) + self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) + else: + + error_message += "There are missing slides for participant " + redcap_id + ";" + logger.info(error_message) + self.db.update_missing_slides(redcap_id) + + # Can't use record_in_error here because we can't set an error message for an image_id that doens't exist + self.db.set_error_message_slide_scan_curation_redcap_id(error=error_message, redcap_id=redcap_id) + if record_in_error: self.db.set_error_message_slide_scan_curation(image_id=image_id, error=error_message) diff --git a/data_management/watch_files.py b/data_management/watch_files.py index c5ae0dd..de4ecd1 100644 --- a/data_management/watch_files.py +++ b/data_management/watch_files.py @@ -119,5 +119,5 @@ def move_packages_to_DLU(self, packages): dlu_watcher.pickup_waiting_files() while True: dlu_watcher.watch_for_files() - time.sleep(60) - dlu_watcher.watch_for_side_manifest_records() \ No newline at end of file + dlu_watcher.watch_for_side_manifest_records() + time.sleep(60) \ No newline at end of file From e949c2a508e120c838965c6b48d8bbdf4f16e789 Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Wed, 8 Oct 2025 13:32:30 -0400 Subject: [PATCH 24/54] K:PMP-5807: renamed methods to be more accurate --- .../services/dlu_package_inventory.py | 6 ++-- data_management/watch_files.py | 35 +++++++++---------- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/data_management/services/dlu_package_inventory.py b/data_management/services/dlu_package_inventory.py index 03fc734..8809d9d 100644 --- a/data_management/services/dlu_package_inventory.py +++ b/data_management/services/dlu_package_inventory.py @@ -17,19 +17,19 @@ def reconnect(self): self.db = MYSQLConnection() self.database = self.db.get_db_connection() - def get_dlu_file(self, status): + def get_dlu_package(self, status): return self.db.get_data( 'SELECT * FROM data_management.data_manager_data_v WHERE ready_to_move_from_globus = %s AND (globus_dlu_status IS NULL OR globus_dlu_status = "recalled")', (status,) ) - def set_dlu_file_waiting(self, status, package_id): + def set_dlu_package_waiting(self, status, package_id): return self.db.insert_data( 'UPDATE data_management.data_manager_data_v SET globus_dlu_status = "waiting" WHERE ready_to_move_from_globus = %s AND dlu_package_id = %s', (status, package_id,) ) - def get_waiting_files(self): + def get_waiting_packages(self): return self.db.get_data( 'Select * from data_management.data_manager_data_v where globus_dlu_status = "waiting" and ready_to_move_from_globus = "yes"' ) diff --git a/data_management/watch_files.py b/data_management/watch_files.py index de4ecd1..a226903 100644 --- a/data_management/watch_files.py +++ b/data_management/watch_files.py @@ -31,15 +31,15 @@ def __init__ (self, db: DLUPackageInventory = None): self.dlu_state = DLUState() self.slide_management = SlideManagement(self.dlu_management) - def watch_for_files(self): - files = self.db.get_dlu_file("yes") - if len(files) == 0: + def watch_for_packages(self): + packages = self.db.get_dlu_package("yes") + if len(packages) == 0: logger.info( "No records were found with status 'yes' " ) else: - self.update_files_for_globus(files) - self.move_packages_to_DLU(files) + self.update_packages_for_globus(packages) + self.move_packages_to_DLU(packages) def watch_for_side_manifest_records(self): equal_num_rows = self.dlu_management.get_equal_num_rows() @@ -52,10 +52,10 @@ def update_slide_scan_curation(self): logger.info("Importing new row(s) into slide_scan_curation") self.slide_management.process_slide_manifest_imports() - def update_files_for_globus(self, files): - for index, file_result in enumerate(files): - logger.info("Setting file status to 'waiting' on package " + file_result['dlu_package_id']) - self.db.set_dlu_file_waiting("yes", file_result['dlu_package_id']) + def update_packages_for_globus(self, packages): + for index, package_result in enumerate(packages): + logger.info("Setting file status to 'waiting' on package " + package_result['dlu_package_id']) + self.db.set_dlu_package_waiting("yes", package_result['dlu_package_id']) def process_file_paths(self, file_list: list[DLUFile]) -> list: dlu_files = [] @@ -64,14 +64,14 @@ def process_file_paths(self, file_list: list[DLUFile]) -> list: dlu_files.append(file) return dlu_files - def pickup_waiting_files(self): - files_in_waiting = self.db.get_waiting_files() - if len(files_in_waiting) == 0: + def pickup_waiting_packages(self): + packages_in_waiting = self.db.get_waiting_packages() + if len(packages_in_waiting) == 0: return logger.info( "No records were found with status 'waiting'" ) else: - self.move_packages_to_DLU(files_in_waiting) + self.move_packages_to_DLU(packages_in_waiting) def move_packages_to_DLU(self, packages): file_list = None @@ -113,11 +113,10 @@ def move_packages_to_DLU(self, packages): self.dlu_state.clear_cache() - if __name__ == "__main__": dlu_watcher = DLUWatcher() - dlu_watcher.pickup_waiting_files() - while True: - dlu_watcher.watch_for_files() + dlu_watcher.pickup_waiting_packages() + while True: + dlu_watcher.watch_for_packages() dlu_watcher.watch_for_side_manifest_records() - time.sleep(60) \ No newline at end of file + time.sleep(60) From e744917168ce267dcbb1b9f9f26f12e8c725a5ad Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Wed, 8 Oct 2025 14:22:27 -0400 Subject: [PATCH 25/54] KPMP-5807: Add some of the error handling --- data_management/services/dlu_filesystem.py | 42 ++++++++++++---------- data_management/services/dlu_management.py | 12 +++++++ data_management/watch_files.py | 26 +++++++++++++- 3 files changed, 60 insertions(+), 20 deletions(-) diff --git a/data_management/services/dlu_filesystem.py b/data_management/services/dlu_filesystem.py index 1285dcf..0384816 100644 --- a/data_management/services/dlu_filesystem.py +++ b/data_management/services/dlu_filesystem.py @@ -188,7 +188,8 @@ def validate_package_directories(self, package_id: str): logger.error("Directory for package " + package_id + " failed validation.") return success - def process_globus_directory(self, directoryListing, globusDirectories: list[DirectoryInfo], packageId, initialDir, calculate_checksums: bool = True): + def process_globus_directory(self, directoryListing, globusDirectories: list[DirectoryInfo], packageId, initialDir, + calculate_checksums: bool = True): for dir in globusDirectories: prefix = "" if not initialDir == "": @@ -199,33 +200,36 @@ def process_globus_directory(self, directoryListing, globusDirectories: list[Dir globusDirectories = [] for item in dir.file_details: if os.path.isdir(item.path): - globusDirectories.append(DirectoryInfo(item.path, calculate_checksums = calculate_checksums)) + globusDirectories.append(DirectoryInfo(item.path, calculate_checksums=calculate_checksums)) else: globusFiles.append(item) directoryListing[currentDir] = globusFiles if len(globusDirectories) > 0: - self.process_globus_directory(directoryListing, globusDirectories, packageId, currentDir,calculate_checksums) + self.process_globus_directory(directoryListing, globusDirectories, packageId, currentDir, + calculate_checksums) return directoryListing - def match_files(self, packageId, calculate_checksums: bool = True) -> list[DLUFile]: - topLevelDir = DirectoryInfo(self.globus_data_directory + '/' + self.globus_dir_prefix + packageId, calculate_checksums = calculate_checksums) - globusFiles = [] - globusDirectories = [] - for obj in topLevelDir.file_details: + def match_files(self, package_id: str, calculate_checksums: bool = True) -> list[DLUFile]: + top_level_dir = DirectoryInfo(self.globus_data_directory + '/' + self.globus_dir_prefix + package_id, + calculate_checksums=calculate_checksums) + globus_files = [] + globus_directories = [] + for obj in top_level_dir.file_details: if os.path.isdir(obj.path): - directory = DirectoryInfo(obj.path, calculate_checksums = calculate_checksums) - globusDirectories.append(directory) + directory = DirectoryInfo(obj.path, calculate_checksums=calculate_checksums) + globus_directories.append(directory) else: - globusFiles.append(obj) - filesInGlobusDirectories = {} - filesInGlobusDirectories[""] = globusFiles - currentDir = "" - filesInGlobusDirectories = self.process_globus_directory(filesInGlobusDirectories, globusDirectories, packageId, currentDir, calculate_checksums) - return self.get_globus_file_paths(filesInGlobusDirectories) - - def get_globus_file_paths(self, filesInGlobusDirectories: dict[str, list[DLUFile]]) -> list[DLUFile]: + globus_files.append(obj) + files_in_globus_directories = {} + files_in_globus_directories[""] = globus_files + current_dir = "" + files_in_globus_directories = self.process_globus_directory(files_in_globus_directories, globus_directories, + package_id, current_dir, calculate_checksums) + return self.get_globus_file_paths(files_in_globus_directories) + + def get_globus_file_paths(self, files_in_globus_directories: dict[str, list[DLUFile]]) -> list[DLUFile]: fileList = [] - for dir, files in filesInGlobusDirectories.items(): + for dir, files in files_in_globus_directories.items(): for file in files: prefix = dir + "/" if dir else "" file.name = prefix + file.name diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index 0ea5bf9..6796c5c 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -259,6 +259,18 @@ def set_error_message_slide_scan_curation_redcap_id(self, error, redcap_id): self.db.insert_data_no_alert("UPDATE slide_scan_curation set error_message = %s where redcap_id = %s", (error, redcap_id,)) + def find_slide_scan_info_by_package_id(self, package_id): + self.db.get_data("SELECT * FROM slide_scan_v WHERE dlu_package_id = %s", + (package_id,)) + + def is_package_missing_slides(self, package_id): + self.db.get_data("SELECT * FROM slide_scan_v WHERE dlu_package_id = %s and missing_slides = 1", + (package_id,)) + + def is_slides_in_error(self, package_id): + self.db.get_data("SELECT * FROM slide_scan_curation WHERE dlu_package_id = %s and error_message IS NOT NULL", + (package_id,)) + if __name__ == "__main__": dlu_management = DluManagement() diff --git a/data_management/watch_files.py b/data_management/watch_files.py index a226903..13ba68a 100644 --- a/data_management/watch_files.py +++ b/data_management/watch_files.py @@ -87,6 +87,11 @@ def move_packages_to_DLU(self, packages): self.dlu_management.update_dlu_package(package_id, { "globus_dlu_status": error_msg }) continue + if package['dlu_packageType'] == 'Whole Slide Images' and package['globus_dlu_status'] != 'recalled': + success = self.do_wsi_file_renames(globus_data_directory, package_id) + if not success: + continue + directory_info = DirectoryInfo(globus_data_directory) if directory_info.file_count == 0 and directory_info.subdir_count == 0: @@ -94,7 +99,7 @@ def move_packages_to_DLU(self, packages): logger.info(error_msg + " Skipping.") self.dlu_management.update_dlu_package(package_id, { "globus_dlu_status": error_msg }) continue - + if directory_info.file_count == 0 and directory_info.subdir_count == 1: contents = "".join(directory_info.dir_contents) top_level_subdir = package_id + "/" + contents @@ -112,6 +117,25 @@ def move_packages_to_DLU(self, packages): self.dlu_state.set_package_state(package_id, PackageState.UPLOAD_SUCCEEDED) self.dlu_state.clear_cache() + def do_wsi_file_renames(self, globus_data_directory: str, package_id: str): + error_msg = "" + slide_scan_info = self.dlu_management.find_slide_scan_info_by_package_id(package_id) + if slide_scan_info is None or len(slide_scan_info) == 0: + error_msg = "Error: package " + package_id + " has no info in slide_scan_v" + + missing_slides = self.dlu_management.is_package_missing_slides(package_id) + if missing_slides is not None and len(missing_slides) > 0: + error_msg = "Error: package " + package_id + " is missing slides" + slides_in_error = self.dlu_management.is_slides_in_error(package_id) + if slides_in_error is not None and len(slides_in_error) > 0: + error_msg = "Error: package " + package_id + " has slides in error" + + if error_msg != "": + self.dlu_management.update_dlu_package(package_id, {"globus_dlu_status": error_msg}) + return False + + return True + if __name__ == "__main__": dlu_watcher = DLUWatcher() From e5ed421cb3b14941f608e7b6e4ed0bb65dc668fc Mon Sep 17 00:00:00 2001 From: dert1129 Date: Thu, 9 Oct 2025 11:46:04 -0400 Subject: [PATCH 26/54] make a more useful error message --- data_management/lib/mysql_connection.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/data_management/lib/mysql_connection.py b/data_management/lib/mysql_connection.py index 36678aa..cd20f44 100644 --- a/data_management/lib/mysql_connection.py +++ b/data_management/lib/mysql_connection.py @@ -146,13 +146,12 @@ def get_data(self, sql, query_data=None): for row in self.cursor: data.append(row) return data - except: - message = "Error: Can't get data_management data." - logger.error(message) + except Exception as error: + logger.error(str(error)) requests.post( slack_url, headers={'Content-type': 'application/json', }, - data='{"text":"' + message + '"}' + data='{"text":"' + "Error: " + str(error) + '"}' ) finally: self.cursor.close() From 4e1dc9c49cc9c23410fb576026369bc6d57e65e6 Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Thu, 9 Oct 2025 13:06:25 -0400 Subject: [PATCH 27/54] KPMP-5807: Finish error handling before rename --- data_management/services/dlu_filesystem.py | 3 ++ data_management/services/dlu_management.py | 4 +++ data_management/watch_files.py | 38 +++++++++++++++++----- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/data_management/services/dlu_filesystem.py b/data_management/services/dlu_filesystem.py index 0384816..df3232c 100644 --- a/data_management/services/dlu_filesystem.py +++ b/data_management/services/dlu_filesystem.py @@ -235,3 +235,6 @@ def get_globus_file_paths(self, files_in_globus_directories: dict[str, list[DLUF file.name = prefix + file.name fileList.append(file) return fileList + + def validate_all_wsi_files_present(self, ): + return True \ No newline at end of file diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index 6796c5c..6ccf2e8 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -271,6 +271,10 @@ def is_slides_in_error(self, package_id): self.db.get_data("SELECT * FROM slide_scan_curation WHERE dlu_package_id = %s and error_message IS NOT NULL", (package_id,)) + def find_not_approved_filenames(self, package_id): + self.db.get_data("SELECT * FROM slide_scan_curation WHERE approve_file_name = 'yes' AND dlu_package_id = %s", + (package_id,)) + if __name__ == "__main__": dlu_management = DluManagement() diff --git a/data_management/watch_files.py b/data_management/watch_files.py index 13ba68a..8901309 100644 --- a/data_management/watch_files.py +++ b/data_management/watch_files.py @@ -93,11 +93,7 @@ def move_packages_to_DLU(self, packages): continue directory_info = DirectoryInfo(globus_data_directory) - - if directory_info.file_count == 0 and directory_info.subdir_count == 0: - error_msg = "Error: package " + package_id + " has no files or top level subdirectory" - logger.info(error_msg + " Skipping.") - self.dlu_management.update_dlu_package(package_id, { "globus_dlu_status": error_msg }) + if not self.is_directory_valid(directory_info, package_id): continue if directory_info.file_count == 0 and directory_info.subdir_count == 1: @@ -121,14 +117,33 @@ def do_wsi_file_renames(self, globus_data_directory: str, package_id: str): error_msg = "" slide_scan_info = self.dlu_management.find_slide_scan_info_by_package_id(package_id) if slide_scan_info is None or len(slide_scan_info) == 0: - error_msg = "Error: package " + package_id + " has no info in slide_scan_v" + error_msg = "Error: Package not found in slide_scan_v" missing_slides = self.dlu_management.is_package_missing_slides(package_id) if missing_slides is not None and len(missing_slides) > 0: - error_msg = "Error: package " + package_id + " is missing slides" + error_msg = "Error: Package is missing slides" slides_in_error = self.dlu_management.is_slides_in_error(package_id) if slides_in_error is not None and len(slides_in_error) > 0: - error_msg = "Error: package " + package_id + " has slides in error" + error_msg = "Error: Package has some slides in error" + unapproved_files = self.dlu_management.find_not_approved_filenames(package_id) + if unapproved_files is not None and len(unapproved_files) > 0: + error_msg = "Error: Package has unapproved filenames" + + directory_info = DirectoryInfo(globus_data_directory) + if not self.is_directory_valid(directory_info, package_id): + return False + + if directory_info.file_count == 0 or directory_info.file_count != len(slide_scan_info): + error_msg = "Error: Globus file count does not match expectation" + + file_list = self.dlu_file_handler.match_files(package_id) + expected_slides = [] + for slide in slide_scan_info: + expected_slides.append(slide['source_file_name']) + for file in file_list: + if file.name not in expected_slides: + error_msg = "Error: Filenames in directory do not match slide_scan_curation info" + continue if error_msg != "": self.dlu_management.update_dlu_package(package_id, {"globus_dlu_status": error_msg}) @@ -136,6 +151,13 @@ def do_wsi_file_renames(self, globus_data_directory: str, package_id: str): return True + def is_directory_valid(self, directory_info, package_id): + if directory_info.file_count == 0 and directory_info.subdir_count == 0: + error_msg = "Error: package " + package_id + " has no files or top level subdirectory" + logger.info(error_msg + " Skipping.") + self.dlu_management.update_dlu_package(package_id, {"globus_dlu_status": error_msg}) + return False + if __name__ == "__main__": dlu_watcher = DLUWatcher() From 63874baa0f66ee18b017fddcbc9e3aa58f223487 Mon Sep 17 00:00:00 2001 From: dert1129 Date: Thu, 9 Oct 2025 14:14:48 -0400 Subject: [PATCH 28/54] fill in package_ids that are null in slide_scan_curation --- data_management/services/dlu_management.py | 16 ++++++++++++++++ data_management/services/slide_management.py | 17 ++++++++++++++++- data_management/watch_files.py | 4 ++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index 0ea5bf9..ab85948 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -239,6 +239,22 @@ def get_spectrack_redcap_record_id(self, kit_id): return result[0]["spectrack_redcap_record_id"] else: return None + + def get_redcap_ids_with_null_package_id(self): + return self.db.get_data( + "select unique redcap_id from slide_scan_curation where dlu_package_id is null and error_message is null", + (None), + ) + + def get_package_ids_for_redcap_id(self, redcap_id): + return self.db.get_data( + "select dlu_package_id from dlu_package_inventory where dlu_subject_id = %s and globus_dlu_status = 'success'", (redcap_id,) + ) + + def update_package_ids_in_slide_scan_curation(self, redcap_id, package_id): + return self.db.insert_data( + "update slide_scan_curation set dlu_package_id = %s where redcap_id = %s and dlu_package_id is null and error_message is null", + (package_id, redcap_id,)) def insert_into_slide_scan_curation(self, values): query = "INSERT INTO slide_scan_curation (image_id, kit_id, redcap_id, new_file_name, source_file_name, " \ diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index ad21b64..5a630be 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -113,7 +113,7 @@ def process_slide_manifest_imports(self): logger.info(error_message) self.db.update_missing_slides(redcap_id) - # Can't use record_in_error here because we can't set an error message for an image_id that doens't exist + # Can't use record_in_error here because we can't set an error message for an image_id that doesn't exist self.db.set_error_message_slide_scan_curation_redcap_id(error=error_message, redcap_id=redcap_id) if record_in_error: @@ -133,3 +133,18 @@ def determine_new_slide_name(self, sample_id: str, kit_id: str, stain_info: str, return None else: return sample_id + "_" + stain_type + "_" + str(numerator) + "of" + str(denominator) + ".svs" + + def fill_in_package_ids(self): + redcap_id_list = self.db.get_redcap_ids_with_null_package_id() + if len(redcap_id_list) != 0: + for row in redcap_id_list: + redcap_id = row['redcap_id'] + package_id_list = self.db.get_package_ids_for_redcap_id(redcap_id) + if None not in package_id_list and len(package_id_list) == 1: + package_id = package_id_list[0]['dlu_package_id'] + self.db.update_package_ids_in_slide_scan_curation(redcap_id=redcap_id, package_id=package_id) + logger.info("Updated package id " + package_id + " for redcap id " + redcap_id) + elif len(package_id_list) > 1: + error_message = "Multiple dlu_package_ids found for redcap_id " + redcap_id + ", unable to fill in package id." + logger.info(error_message) + self.db.set_error_message_slide_scan_curation_redcap_id(error=error_message, redcap_id=redcap_id) \ No newline at end of file diff --git a/data_management/watch_files.py b/data_management/watch_files.py index de4ecd1..1c0bc5e 100644 --- a/data_management/watch_files.py +++ b/data_management/watch_files.py @@ -72,6 +72,9 @@ def pickup_waiting_files(self): ) else: self.move_packages_to_DLU(files_in_waiting) + + def fill_in_null_package_ids(self): + self.slide_management.fill_in_package_ids() def move_packages_to_DLU(self, packages): file_list = None @@ -120,4 +123,5 @@ def move_packages_to_DLU(self, packages): while True: dlu_watcher.watch_for_files() dlu_watcher.watch_for_side_manifest_records() + dlu_watcher.fill_in_null_package_ids() time.sleep(60) \ No newline at end of file From 30e35dd464b942584f93071caeea91811ba9a54f Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Thu, 9 Oct 2025 14:51:37 -0400 Subject: [PATCH 29/54] KPMP-5807: rename files --- data_management/services/dlu_filesystem.py | 6 ++++++ data_management/watch_files.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/data_management/services/dlu_filesystem.py b/data_management/services/dlu_filesystem.py index df3232c..052b511 100644 --- a/data_management/services/dlu_filesystem.py +++ b/data_management/services/dlu_filesystem.py @@ -114,6 +114,12 @@ def chown_dir(self, package_id: str, files: list[DLUFile], user_id): if os.stat(subdir_path).st_uid != user_id or os.stat(subdir_path).st_gid != int(os.environ['dlu_group']): os.chown(subdir_path, user_id, int(os.environ['dlu_group'])) + def rename_files(self, file_list: list[DLUFile], slide_name_map, package_id ): + source_package_directory = self.globus_data_directory + '/' + self.globus_dir_prefix + package_id + for file in file_list: + os.rename(os.path.join(source_package_directory, file.name), + os.path.join(source_package_directory, slide_name_map[file.name])) + def copy_files(self, package_id: str, file_list: list[DLUFile], preserve_path: bool = False, no_src_package: bool = False): files_copied = 0 source_wd = os.getcwd() diff --git a/data_management/watch_files.py b/data_management/watch_files.py index 8901309..df757ad 100644 --- a/data_management/watch_files.py +++ b/data_management/watch_files.py @@ -92,6 +92,7 @@ def move_packages_to_DLU(self, packages): if not success: continue + # We do end up doing this check twice for WSIs but we are modifying the filenames, so it is probably good directory_info = DirectoryInfo(globus_data_directory) if not self.is_directory_valid(directory_info, package_id): continue @@ -101,7 +102,7 @@ def move_packages_to_DLU(self, packages): top_level_subdir = package_id + "/" + contents file_list = self.dlu_file_handler.match_files(top_level_subdir) else: - file_list = self.dlu_file_handler.match_files(package_id) + file_list = self.dlu_file_handler.match_files(package_id) self.dlu_file_handler.copy_files(package_id, self.process_file_paths(directory_info.file_details)) self.dlu_file_handler.chown_dir(package_id, file_list, int(os.environ['dlu_user'])) @@ -131,15 +132,19 @@ def do_wsi_file_renames(self, globus_data_directory: str, package_id: str): directory_info = DirectoryInfo(globus_data_directory) if not self.is_directory_valid(directory_info, package_id): + # This method logs errors in it, so no need to continue, or capture error message return False if directory_info.file_count == 0 or directory_info.file_count != len(slide_scan_info): error_msg = "Error: Globus file count does not match expectation" - file_list = self.dlu_file_handler.match_files(package_id) + # No need to calc checksums here, we just need the list of files + file_list = self.dlu_file_handler.match_files(package_id, calculate_checksums=False) expected_slides = [] + slide_name_map = {} for slide in slide_scan_info: expected_slides.append(slide['source_file_name']) + slide_name_map[slide['source_file_name']] = slide['new_file_name'] for file in file_list: if file.name not in expected_slides: error_msg = "Error: Filenames in directory do not match slide_scan_curation info" @@ -149,6 +154,7 @@ def do_wsi_file_renames(self, globus_data_directory: str, package_id: str): self.dlu_management.update_dlu_package(package_id, {"globus_dlu_status": error_msg}) return False + self.dlu_file_handler.rename_files(file_list, slide_name_map,package_id) return True def is_directory_valid(self, directory_info, package_id): From e39ecb9abdee1ab4d4dd7199a024337f246aa071 Mon Sep 17 00:00:00 2001 From: dert1129 Date: Thu, 9 Oct 2025 15:02:09 -0400 Subject: [PATCH 30/54] fix update and select statements --- data_management/services/dlu_management.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index 0ea5bf9..341118e 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -104,12 +104,12 @@ def update_dlu_package(self, package_id: str, fields_values: dict): def get_missing_slides(self, redcap_id: str): return self.db.get_data( - "select * from missing_slides_v where spectrack_redcap_record_id = %", - redcap_id,), + "select * from missing_slides_v where spectrack_redcap_record_id = %s", + (redcap_id,)), def update_missing_slides(self, redcap_id: str): return self.db.get_data( - "update slide_scan_curation set missing_slides = 1 where redcap_id = %s", redcap_id, + "update slide_scan_curation set missing_slides = 1 where redcap_id = %s", (redcap_id,) ), def insert_dlu_file(self, values): From 0e78f969c45b937bfd9904e102baae239de2bfe7 Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Fri, 10 Oct 2025 10:19:29 -0400 Subject: [PATCH 31/54] KPMP-5807: Rename some vars to be snake case --- data_management/services/dlu_filesystem.py | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/data_management/services/dlu_filesystem.py b/data_management/services/dlu_filesystem.py index 052b511..a8a0ae7 100644 --- a/data_management/services/dlu_filesystem.py +++ b/data_management/services/dlu_filesystem.py @@ -194,26 +194,26 @@ def validate_package_directories(self, package_id: str): logger.error("Directory for package " + package_id + " failed validation.") return success - def process_globus_directory(self, directoryListing, globusDirectories: list[DirectoryInfo], packageId, initialDir, - calculate_checksums: bool = True): - for dir in globusDirectories: + def process_globus_directory(self, directory_listing, globus_directories: list[DirectoryInfo], package_id, + initial_dir, calculate_checksums: bool = True): + for dir in globus_directories: prefix = "" - if not initialDir == "": - prefix = initialDir + "/" - currentDir = prefix + os.path.basename(dir.directory_path) + if not initial_dir == "": + prefix = initial_dir + "/" + current_dir = prefix + os.path.basename(dir.directory_path) - globusFiles = [] - globusDirectories = [] + globus_files = [] + globus_directories = [] for item in dir.file_details: if os.path.isdir(item.path): - globusDirectories.append(DirectoryInfo(item.path, calculate_checksums=calculate_checksums)) + globus_directories.append(DirectoryInfo(item.path, calculate_checksums=calculate_checksums)) else: - globusFiles.append(item) - directoryListing[currentDir] = globusFiles - if len(globusDirectories) > 0: - self.process_globus_directory(directoryListing, globusDirectories, packageId, currentDir, + globus_files.append(item) + directory_listing[current_dir] = globus_files + if len(globus_directories) > 0: + self.process_globus_directory(directory_listing, globus_directories, package_id, current_dir, calculate_checksums) - return directoryListing + return directory_listing def match_files(self, package_id: str, calculate_checksums: bool = True) -> list[DLUFile]: top_level_dir = DirectoryInfo(self.globus_data_directory + '/' + self.globus_dir_prefix + package_id, From 0bddcba028fbb1b94b383ac61b7f188fd1e4588f Mon Sep 17 00:00:00 2001 From: dert1129 Date: Fri, 10 Oct 2025 14:11:25 -0400 Subject: [PATCH 32/54] swap around the if statement --- data_management/services/slide_management.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index 5a630be..01e2e70 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -102,19 +102,19 @@ def process_slide_manifest_imports(self): check_missing_slides = self.db.get_missing_slides(redcap_id) if len(check_missing_slides) >= 1: + error_message += "There are missing slide(s) for participant " + redcap_id + ";" + logger.info(error_message) + self.db.update_missing_slides(redcap_id) + + # Can't use record_in_error here because we can't set an error message for an image_id that doesn't exist + self.db.set_error_message_slide_scan_curation_redcap_id(error=error_message, redcap_id=redcap_id) + else: slide_scan = SlideScanModel(image_id=image_id, redcap_id=redcap_id, kit_id=kit_id, new_file_name=new_file_name, source_file_name=source_file_name, source_folder_name=source_folder_name) self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) - else: - - error_message += "There are missing slides for participant " + redcap_id + ";" - logger.info(error_message) - self.db.update_missing_slides(redcap_id) - # Can't use record_in_error here because we can't set an error message for an image_id that doesn't exist - self.db.set_error_message_slide_scan_curation_redcap_id(error=error_message, redcap_id=redcap_id) if record_in_error: self.db.set_error_message_slide_scan_curation(image_id=image_id, error=error_message) From 29a0ac62193837e0d137a97751cddb5c43efa9b2 Mon Sep 17 00:00:00 2001 From: dert1129 Date: Mon, 13 Oct 2025 10:17:58 -0400 Subject: [PATCH 33/54] run the package_id filler --- data_management/watch_files.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/data_management/watch_files.py b/data_management/watch_files.py index c6366ce..5192976 100644 --- a/data_management/watch_files.py +++ b/data_management/watch_files.py @@ -113,6 +113,8 @@ def move_packages_to_DLU(self, packages): self.dlu_state.set_package_state(package_id, PackageState.UPLOAD_SUCCEEDED) self.dlu_state.clear_cache() + def fill_in_null_package_ids(self): + self.slide_management.fill_in_package_ids() def do_wsi_file_renames(self, globus_data_directory: str, package_id: str): error_msg = "" From 463fe8a7754338e08c3edd7757c728dcc6f5f6fa Mon Sep 17 00:00:00 2001 From: dert1129 Date: Mon, 13 Oct 2025 13:17:23 -0400 Subject: [PATCH 34/54] skip redcap ids that are null --- data_management/services/slide_management.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index 01e2e70..1805ce1 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -138,6 +138,9 @@ def fill_in_package_ids(self): redcap_id_list = self.db.get_redcap_ids_with_null_package_id() if len(redcap_id_list) != 0: for row in redcap_id_list: + if row['redcap_id'] is None or row['redcap_id'] == "": + logger.info("Skipping null redcap_id") + continue redcap_id = row['redcap_id'] package_id_list = self.db.get_package_ids_for_redcap_id(redcap_id) if None not in package_id_list and len(package_id_list) == 1: From 8dbd84922a110d13a39d8a6d16234bda34ca56fc Mon Sep 17 00:00:00 2001 From: dert1129 Date: Tue, 14 Oct 2025 10:30:20 -0400 Subject: [PATCH 35/54] do not fetch null redcap_ids --- data_management/services/dlu_management.py | 2 +- data_management/services/slide_management.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index 780e7bd..c4fc77b 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -242,7 +242,7 @@ def get_spectrack_redcap_record_id(self, kit_id): def get_redcap_ids_with_null_package_id(self): return self.db.get_data( - "select unique redcap_id from slide_scan_curation where dlu_package_id is null and error_message is null", + "select unique redcap_id from slide_scan_curation where dlu_package_id is null and error_message is null and redcap_id is not null", (None), ) diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index 1805ce1..01e2e70 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -138,9 +138,6 @@ def fill_in_package_ids(self): redcap_id_list = self.db.get_redcap_ids_with_null_package_id() if len(redcap_id_list) != 0: for row in redcap_id_list: - if row['redcap_id'] is None or row['redcap_id'] == "": - logger.info("Skipping null redcap_id") - continue redcap_id = row['redcap_id'] package_id_list = self.db.get_package_ids_for_redcap_id(redcap_id) if None not in package_id_list and len(package_id_list) == 1: From 1ad7cf8252cf9d69496bbf97f4537844e035fd25 Mon Sep 17 00:00:00 2001 From: dert1129 Date: Wed, 15 Oct 2025 14:26:49 -0400 Subject: [PATCH 36/54] insert slides into the curation table anyways, but mark them as missing a slide --- data_management/services/slide_management.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index 01e2e70..4a3a8ff 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -100,6 +100,10 @@ def process_slide_manifest_imports(self): source_folder_name = file_location.parent.name check_missing_slides = self.db.get_missing_slides(redcap_id) + slide_scan = SlideScanModel(image_id=image_id, redcap_id=redcap_id, kit_id=kit_id, + new_file_name=new_file_name, source_file_name=source_file_name, + source_folder_name=source_folder_name) + self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) if len(check_missing_slides) >= 1: error_message += "There are missing slide(s) for participant " + redcap_id + ";" @@ -108,13 +112,6 @@ def process_slide_manifest_imports(self): # Can't use record_in_error here because we can't set an error message for an image_id that doesn't exist self.db.set_error_message_slide_scan_curation_redcap_id(error=error_message, redcap_id=redcap_id) - else: - slide_scan = SlideScanModel(image_id=image_id, redcap_id=redcap_id, kit_id=kit_id, - new_file_name=new_file_name, source_file_name=source_file_name, - source_folder_name=source_folder_name) - - self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) - if record_in_error: self.db.set_error_message_slide_scan_curation(image_id=image_id, error=error_message) From 3f9824dfdbedadf8a0ac02ea2b36956ae77547d8 Mon Sep 17 00:00:00 2001 From: dert1129 Date: Wed, 15 Oct 2025 14:27:58 -0400 Subject: [PATCH 37/54] shift variable to be stored after insertion --- data_management/services/slide_management.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index 4a3a8ff..7a9ad60 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -99,14 +99,14 @@ def process_slide_manifest_imports(self): source_file_name = file_location.name source_folder_name = file_location.parent.name - check_missing_slides = self.db.get_missing_slides(redcap_id) slide_scan = SlideScanModel(image_id=image_id, redcap_id=redcap_id, kit_id=kit_id, new_file_name=new_file_name, source_file_name=source_file_name, source_folder_name=source_folder_name) self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) + check_missing_slides = self.db.get_missing_slides(redcap_id) if len(check_missing_slides) >= 1: - error_message += "There are missing slide(s) for participant " + redcap_id + ";" + error_message += "There are missing slide(s) for participant " + redcap_id + "; " logger.info(error_message) self.db.update_missing_slides(redcap_id) From c6a94d9c1ceb96dacb0b1a3604329019c7cad455 Mon Sep 17 00:00:00 2001 From: dert1129 Date: Thu, 16 Oct 2025 15:03:41 -0400 Subject: [PATCH 38/54] log the error message to the console --- data_management/services/slide_management.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index 7a9ad60..5c86e0d 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -80,6 +80,11 @@ def process_slide_manifest_imports(self): kit_id = record["outside_acc"] image_id = record["image_id"] redcap_id = self.db.get_spectrack_redcap_record_id(kit_id) + if redcap_id is None: + error_message = "No redcap_id found for kit_id " + kit_id + "; " + logger.error(error_message) + continue + if record["accession"] is not None: new_file_name = self.determine_new_slide_name(sample_id=record["accession"], kit_id=kit_id, stain_info=record["stain"], block_id=record["block_id"]) @@ -105,8 +110,11 @@ def process_slide_manifest_imports(self): self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) check_missing_slides = self.db.get_missing_slides(redcap_id) if len(check_missing_slides) >= 1: - - error_message += "There are missing slide(s) for participant " + redcap_id + "; " + if error_message != None: + + error_message += "There are missing slide(s) for participant " + redcap_id + "; " + elif error_message is None: + error_message = "There are missing slide(s) for participant " + redcap_id + "; " logger.info(error_message) self.db.update_missing_slides(redcap_id) From c348acd3dd0d0fb7e3c943d1813cb19d83502ec5 Mon Sep 17 00:00:00 2001 From: dert1129 Date: Thu, 16 Oct 2025 15:03:51 -0400 Subject: [PATCH 39/54] remove where condition --- data_management/services/dlu_management.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index c4fc77b..04a62f6 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -230,7 +230,7 @@ def get_equal_num_rows(self): def get_new_slide_manifest_import_rows(self): return self.db.get_data("SELECT * FROM slide_manifest_import WHERE image_id NOT IN " - "(SELECT image_id FROM slide_scan_curation where missing_slides = 0)") + "(SELECT image_id FROM slide_scan_curation)") def get_spectrack_redcap_record_id(self, kit_id): result = self.db.get_data("SELECT spectrack_redcap_record_id FROM spectrack_specimen " From 4e31e6000b82d62598402612c8faeae0a2e64f21 Mon Sep 17 00:00:00 2001 From: dert1129 Date: Thu, 23 Oct 2025 13:24:45 -0400 Subject: [PATCH 40/54] use all() --- data_management/services/slide_management.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index 5c86e0d..6d3fd87 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -109,7 +109,7 @@ def process_slide_manifest_imports(self): source_folder_name=source_folder_name) self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) check_missing_slides = self.db.get_missing_slides(redcap_id) - if len(check_missing_slides) >= 1: + if not all(check_missing_slides): if error_message != None: error_message += "There are missing slide(s) for participant " + redcap_id + "; " From f37534ceeadd6488f136406b6a38540171090e1a Mon Sep 17 00:00:00 2001 From: dert1129 Date: Mon, 27 Oct 2025 10:35:30 -0400 Subject: [PATCH 41/54] invert if statement --- data_management/services/slide_management.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index 6d3fd87..da557b4 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -109,7 +109,7 @@ def process_slide_manifest_imports(self): source_folder_name=source_folder_name) self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) check_missing_slides = self.db.get_missing_slides(redcap_id) - if not all(check_missing_slides): + if all(check_missing_slides): if error_message != None: error_message += "There are missing slide(s) for participant " + redcap_id + "; " From d8c87c528dcd3e1e5a8c622b94db19a0da0bac9c Mon Sep 17 00:00:00 2001 From: dert1129 Date: Mon, 27 Oct 2025 10:46:47 -0400 Subject: [PATCH 42/54] added a logger to tell the dev when imports are done --- data_management/services/slide_management.py | 1 + 1 file changed, 1 insertion(+) diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index da557b4..d228fe6 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -123,6 +123,7 @@ def process_slide_manifest_imports(self): if record_in_error: self.db.set_error_message_slide_scan_curation(image_id=image_id, error=error_message) + logger.info("Processed " + str(len(new_records)) + " new slide_manifest_import records.") def determine_new_slide_name(self, sample_id: str, kit_id: str, stain_info: str, block_id: str): slides_for_kit = self.db.get_slide_manifest_import_by_kit(kit_id, stain_info) From 2aa4f2b22869030fab08afe7827cfa9cb27cfed9 Mon Sep 17 00:00:00 2001 From: rlreamy <34109594+rlreamy@users.noreply.github.com> Date: Thu, 30 Oct 2025 14:11:16 -0400 Subject: [PATCH 43/54] Update dlu_management.py Fix the conditions for getting a package id for slides --- data_management/services/dlu_management.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index 04a62f6..635ca82 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -248,7 +248,7 @@ def get_redcap_ids_with_null_package_id(self): def get_package_ids_for_redcap_id(self, redcap_id): return self.db.get_data( - "select dlu_package_id from dlu_package_inventory where dlu_subject_id = %s and globus_dlu_status = 'success'", (redcap_id,) + "select dlu_package_id from dlu_package_inventory where dlu_subject_id = %s and globus_dlu_status IS NULL", (redcap_id,) ) def update_package_ids_in_slide_scan_curation(self, redcap_id, package_id): From f5c842c902bced21fa3fa55a56ad2210f4839034 Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Wed, 12 Nov 2025 14:54:26 -0500 Subject: [PATCH 44/54] KPMP-6260: Check for missing slides after inserting to ensure we are all up to date --- data_management/services/dlu_management.py | 28 +++++++++--------- data_management/services/slide_management.py | 30 ++++++++++++-------- data_management/watch_files.py | 1 + 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index 04a62f6..e4a2fdb 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -101,11 +101,6 @@ def update_dlu_package(self, package_id: str, fields_values: dict): values = query_info["values"][0:] + (package_id,) query = "UPDATE data_manager_data_v SET " + query_info["set_clause"] + " WHERE dlu_package_id = %s" self.db.insert_data(query, values) - - def get_missing_slides(self, redcap_id: str): - return self.db.get_data( - "select * from missing_slides_v where spectrack_redcap_record_id = %s", - (redcap_id,)), def update_missing_slides(self, redcap_id: str): return self.db.get_data( @@ -270,27 +265,34 @@ def get_slide_manifest_import_by_kit(self, kit_id, stain): def set_error_message_slide_scan_curation(self, error, image_id): self.db.insert_data("UPDATE slide_scan_curation set error_message = %s where image_id = %s", (error, image_id,)) - - def set_error_message_slide_scan_curation_redcap_id(self, error, redcap_id): - self.db.insert_data_no_alert("UPDATE slide_scan_curation set error_message = %s where redcap_id = %s", - (error, redcap_id,)) def find_slide_scan_info_by_package_id(self, package_id): - self.db.get_data("SELECT * FROM slide_scan_v WHERE dlu_package_id = %s", + return self.db.get_data("SELECT * FROM slide_scan_v WHERE dlu_package_id = %s", (package_id,)) def is_package_missing_slides(self, package_id): - self.db.get_data("SELECT * FROM slide_scan_v WHERE dlu_package_id = %s and missing_slides = 1", + return self.db.get_data("SELECT * FROM slide_scan_v WHERE dlu_package_id = %s and missing_slides = 1", (package_id,)) + def slides_marked_missing_by_redcap_id(self, redcap_id: str): + return self.db.get_data("SELECT * FROM slide_scan_v WHERE redcap_id = %s AND missing_slides = 1", + (redcap_id,)) + + def get_missing_slides_from_view(self, redcap_id: str): + return self.db.get_data("select * from missing_slides_v where spectrack_redcap_record_id = %s", + (redcap_id,)) + def is_slides_in_error(self, package_id): - self.db.get_data("SELECT * FROM slide_scan_curation WHERE dlu_package_id = %s and error_message IS NOT NULL", + return self.db.get_data("SELECT * FROM slide_scan_curation WHERE dlu_package_id = %s and error_message IS NOT NULL", (package_id,)) def find_not_approved_filenames(self, package_id): - self.db.get_data("SELECT * FROM slide_scan_curation WHERE approve_file_name = 'yes' AND dlu_package_id = %s", + return self.db.get_data("SELECT * FROM slide_scan_curation WHERE approve_file_name = 'yes' AND dlu_package_id = %s", (package_id,)) + def update_missing_slide_flag(self, image_id): + return self.db.insert_data("UPDATE slide_scan_curation SET missing_slides = 0 WHERE image_id = %s", + (image_id,)) if __name__ == "__main__": dlu_management = DluManagement() diff --git a/data_management/services/slide_management.py b/data_management/services/slide_management.py index 6d3fd87..72cf473 100644 --- a/data_management/services/slide_management.py +++ b/data_management/services/slide_management.py @@ -74,6 +74,7 @@ def __init__(self, db): def process_slide_manifest_imports(self): new_records = self.db.get_new_slide_manifest_import_rows() + redcap_ids_processed = [] for record in new_records: record_in_error = False error_message = "" @@ -84,7 +85,7 @@ def process_slide_manifest_imports(self): error_message = "No redcap_id found for kit_id " + kit_id + "; " logger.error(error_message) continue - + if record["accession"] is not None: new_file_name = self.determine_new_slide_name(sample_id=record["accession"], kit_id=kit_id, stain_info=record["stain"], block_id=record["block_id"]) @@ -108,22 +109,27 @@ def process_slide_manifest_imports(self): new_file_name=new_file_name, source_file_name=source_file_name, source_folder_name=source_folder_name) self.db.insert_into_slide_scan_curation(slide_scan.get_dmd_tuple()) - check_missing_slides = self.db.get_missing_slides(redcap_id) + check_missing_slides = self.db.get_missing_slides_from_view(redcap_id) + redcap_ids_processed.append(redcap_id) if not all(check_missing_slides): - if error_message != None: - - error_message += "There are missing slide(s) for participant " + redcap_id + "; " - elif error_message is None: - error_message = "There are missing slide(s) for participant " + redcap_id + "; " - logger.info(error_message) self.db.update_missing_slides(redcap_id) - - # Can't use record_in_error here because we can't set an error message for an image_id that doesn't exist - self.db.set_error_message_slide_scan_curation_redcap_id(error=error_message, redcap_id=redcap_id) - + if record_in_error: self.db.set_error_message_slide_scan_curation(image_id=image_id, error=error_message) + for redcap_id in redcap_ids_processed: + self.update_missing_slides(redcap_id) + + def update_missing_slides(self, redcap_id: str): + # This MAY seem redundant, however this will ensure that we unmark any missing slides records that just got + # the missing one added + missing_slides = self.db.get_missing_slides_from_view(redcap_id) + if not missing_slides or len(missing_slides) ==0 : + slides_marked_missing = self.db.slides_marked_missing_by_redcap_id(redcap_id) + if slides_marked_missing and len(slides_marked_missing) > 0: + for slide in slides_marked_missing: + self.db.update_missing_slide_flag(slide['image_id']) + def determine_new_slide_name(self, sample_id: str, kit_id: str, stain_info: str, block_id: str): slides_for_kit = self.db.get_slide_manifest_import_by_kit(kit_id, stain_info) diff --git a/data_management/watch_files.py b/data_management/watch_files.py index 5192976..72469ca 100644 --- a/data_management/watch_files.py +++ b/data_management/watch_files.py @@ -113,6 +113,7 @@ def move_packages_to_DLU(self, packages): self.dlu_state.set_package_state(package_id, PackageState.UPLOAD_SUCCEEDED) self.dlu_state.clear_cache() + def fill_in_null_package_ids(self): self.slide_management.fill_in_package_ids() From d2d20789f88a82549b10ae2a6ffe93031b74bcce Mon Sep 17 00:00:00 2001 From: Becky Reamy Date: Thu, 4 Dec 2025 10:58:29 -0500 Subject: [PATCH 45/54] KPMP-5807: Bunch of fixes to get happy path working --- data_management/DluWatcher | 2 + data_management/services/dlu_filesystem.py | 22 +++++- data_management/services/dlu_management.py | 14 ++-- data_management/watch_files.py | 84 +++++++++++++++------- 4 files changed, 85 insertions(+), 37 deletions(-) diff --git a/data_management/DluWatcher b/data_management/DluWatcher index 4818e3f..c44ab9e 100644 --- a/data_management/DluWatcher +++ b/data_management/DluWatcher @@ -1,5 +1,7 @@ FROM python:3.10-slim-bullseye +USER root + COPY requirements.txt ./ RUN pip3 install --progress-bar off --no-cache-dir -r requirements.txt diff --git a/data_management/services/dlu_filesystem.py b/data_management/services/dlu_filesystem.py index a8a0ae7..f1b9d44 100644 --- a/data_management/services/dlu_filesystem.py +++ b/data_management/services/dlu_filesystem.py @@ -8,6 +8,7 @@ from zarr_checksum import compute_zarr_checksum from zarr_checksum.generators import yield_files_local from mmap import mmap, ACCESS_READ +import subprocess logger = logging.getLogger("DLUFilesystem") logger.setLevel(logging.INFO) @@ -114,11 +115,26 @@ def chown_dir(self, package_id: str, files: list[DLUFile], user_id): if os.stat(subdir_path).st_uid != user_id or os.stat(subdir_path).st_gid != int(os.environ['dlu_group']): os.chown(subdir_path, user_id, int(os.environ['dlu_group'])) - def rename_files(self, file_list: list[DLUFile], slide_name_map, package_id ): + def rename_and_move_files(self, file_list: list[DLUFile], slide_name_map, package_id ): + dluFiles = [] + dest_package_directory = os.path.join(self.dlu_data_directory, self.dlu_package_dir_prefix + package_id) + if os.path.exists(dest_package_directory): + shutil.rmtree(dest_package_directory) + if not os.path.exists(dest_package_directory): + logger.info("Creating directory " + dest_package_directory) + os.makedirs(dest_package_directory, exist_ok=True) + source_package_directory = self.globus_data_directory + '/' + self.globus_dir_prefix + package_id for file in file_list: - os.rename(os.path.join(source_package_directory, file.name), - os.path.join(source_package_directory, slide_name_map[file.name])) + dest_file = os.path.join(dest_package_directory, slide_name_map[file.name]) + logger.info("Copying file " + os.path.join(source_package_directory, file.name) + " to " + + os.path.join(dest_package_directory, slide_name_map[file.name])) + shutil.copy(os.path.join(source_package_directory, file.name), + dest_file) + file = DLUFile(name=slide_name_map[file.name], path=dest_package_directory, + checksum=calculate_checksum(dest_file), size=os.path.getsize(dest_file)) + dluFiles.append(file) + return dluFiles def copy_files(self, package_id: str, file_list: list[DLUFile], preserve_path: bool = False, no_src_package: bool = False): files_copied = 0 diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index 6ccf2e8..e5eab61 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -160,7 +160,7 @@ def find_all_files(self): ) def update_md5(self, file_id: str, checksum: str, package_id: str): - self.db.insert_data("UPDATE dlu_file SET dlu_md5checksum = %s WHERE dlu_file_id = %s and dlu_package_id = %s", + return self.db.insert_data("UPDATE dlu_file SET dlu_md5checksum = %s WHERE dlu_file_id = %s and dlu_package_id = %s", (checksum, file_id,package_id)) def move_globus_files_to_dlu(self, package_id: str): @@ -252,27 +252,27 @@ def get_slide_manifest_import_by_kit(self, kit_id, stain): (kit_id,stain,)) def set_error_message_slide_scan_curation(self, error, image_id): - self.db.insert_data("UPDATE slide_scan_curation set error_message = %s where image_id = %s", + return self.db.insert_data("UPDATE slide_scan_curation set error_message = %s where image_id = %s", (error, image_id,)) def set_error_message_slide_scan_curation_redcap_id(self, error, redcap_id): - self.db.insert_data_no_alert("UPDATE slide_scan_curation set error_message = %s where redcap_id = %s", + return self.db.insert_data_no_alert("UPDATE slide_scan_curation set error_message = %s where redcap_id = %s", (error, redcap_id,)) def find_slide_scan_info_by_package_id(self, package_id): - self.db.get_data("SELECT * FROM slide_scan_v WHERE dlu_package_id = %s", + return self.db.get_data("SELECT * FROM slide_scan_v WHERE dlu_package_id = %s", (package_id,)) def is_package_missing_slides(self, package_id): - self.db.get_data("SELECT * FROM slide_scan_v WHERE dlu_package_id = %s and missing_slides = 1", + return self.db.get_data("SELECT * FROM slide_scan_v WHERE dlu_package_id = %s and missing_slides = 1", (package_id,)) def is_slides_in_error(self, package_id): - self.db.get_data("SELECT * FROM slide_scan_curation WHERE dlu_package_id = %s and error_message IS NOT NULL", + return self.db.get_data("SELECT * FROM slide_scan_curation WHERE dlu_package_id = %s and error_message IS NOT NULL", (package_id,)) def find_not_approved_filenames(self, package_id): - self.db.get_data("SELECT * FROM slide_scan_curation WHERE approve_file_name = 'yes' AND dlu_package_id = %s", + return self.db.get_data("SELECT * FROM slide_scan_curation WHERE approve_file_name = 'yes' AND dlu_package_id = %s", (package_id,)) diff --git a/data_management/watch_files.py b/data_management/watch_files.py index df757ad..cae3fd3 100644 --- a/data_management/watch_files.py +++ b/data_management/watch_files.py @@ -75,7 +75,9 @@ def pickup_waiting_packages(self): def move_packages_to_DLU(self, packages): file_list = None + for _, package in enumerate(packages): + skip_copy = False package_id = package['dlu_package_id'] logger.info("Moving package " + package_id) @@ -91,52 +93,62 @@ def move_packages_to_DLU(self, packages): success = self.do_wsi_file_renames(globus_data_directory, package_id) if not success: continue + else: + skip_copy = True - # We do end up doing this check twice for WSIs but we are modifying the filenames, so it is probably good - directory_info = DirectoryInfo(globus_data_directory) - if not self.is_directory_valid(directory_info, package_id): - continue + if not skip_copy: + directory_info = DirectoryInfo(globus_data_directory) + if not self.is_directory_valid(directory_info, package_id): + continue - if directory_info.file_count == 0 and directory_info.subdir_count == 1: - contents = "".join(directory_info.dir_contents) - top_level_subdir = package_id + "/" + contents - file_list = self.dlu_file_handler.match_files(top_level_subdir) - else: - file_list = self.dlu_file_handler.match_files(package_id) - - self.dlu_file_handler.copy_files(package_id, self.process_file_paths(directory_info.file_details)) - self.dlu_file_handler.chown_dir(package_id, file_list, int(os.environ['dlu_user'])) - file_info = self.dlu_management.insert_dlu_files(package_id, file_list) - self.dlu_management.update_dlu_package(package_id, { "globus_dlu_status": "success" }) - self.dlu_management.update_dlu_package(package_id, { "ready_to_move_from_globus": "done" }) - self.dlu_mongo.update_package_files(package_id, file_info) - - self.dlu_state.set_package_state(package_id, PackageState.UPLOAD_SUCCEEDED) - self.dlu_state.clear_cache() + if directory_info.file_count == 0 and directory_info.subdir_count == 1: + contents = "".join(directory_info.dir_contents) + top_level_subdir = package_id + "/" + contents + file_list = self.dlu_file_handler.match_files(top_level_subdir) + else: + file_list = self.dlu_file_handler.match_files(package_id) + + self.dlu_file_handler.copy_files(package_id, self.process_file_paths(directory_info.file_details)) + self.dlu_file_handler.chown_dir(package_id, file_list, int(os.environ['dlu_user'])) + file_info = self.dlu_management.insert_dlu_files(package_id, file_list) + self.dlu_management.update_dlu_package(package_id, { "globus_dlu_status": "success" }) + self.dlu_management.update_dlu_package(package_id, { "ready_to_move_from_globus": "done" }) + self.dlu_mongo.update_package_files(package_id, file_info) + + self.dlu_state.set_package_state(package_id, PackageState.UPLOAD_SUCCEEDED) + self.dlu_state.clear_cache() def do_wsi_file_renames(self, globus_data_directory: str, package_id: str): + logger.info("starting rename process") error_msg = "" slide_scan_info = self.dlu_management.find_slide_scan_info_by_package_id(package_id) if slide_scan_info is None or len(slide_scan_info) == 0: - error_msg = "Error: Package not found in slide_scan_v" + self.log_err_message_slide_rename("Error: Package not found in slide_scan_v", package_id) + return False missing_slides = self.dlu_management.is_package_missing_slides(package_id) if missing_slides is not None and len(missing_slides) > 0: - error_msg = "Error: Package is missing slides" + self.log_err_message_slide_rename( "Error: Package is missing slides", package_id) + return False + slides_in_error = self.dlu_management.is_slides_in_error(package_id) if slides_in_error is not None and len(slides_in_error) > 0: - error_msg = "Error: Package has some slides in error" + self.log_err_message_slide_rename("Error: Package has some slides in error", package_id) + return False + unapproved_files = self.dlu_management.find_not_approved_filenames(package_id) if unapproved_files is not None and len(unapproved_files) > 0: - error_msg = "Error: Package has unapproved filenames" + self.log_err_message_slide_rename("Error: Package has unapproved filenames", package_id) + return False - directory_info = DirectoryInfo(globus_data_directory) + directory_info = DirectoryInfo(globus_data_directory, calculate_checksums=False) if not self.is_directory_valid(directory_info, package_id): # This method logs errors in it, so no need to continue, or capture error message return False if directory_info.file_count == 0 or directory_info.file_count != len(slide_scan_info): - error_msg = "Error: Globus file count does not match expectation" + self.log_err_message_slide_rename("Error: Globus file count does not match expectation", package_id) + return False # No need to calc checksums here, we just need the list of files file_list = self.dlu_file_handler.match_files(package_id, calculate_checksums=False) @@ -154,15 +166,33 @@ def do_wsi_file_renames(self, globus_data_directory: str, package_id: str): self.dlu_management.update_dlu_package(package_id, {"globus_dlu_status": error_msg}) return False - self.dlu_file_handler.rename_files(file_list, slide_name_map,package_id) + copied_files = self.dlu_file_handler.rename_and_move_files(file_list, slide_name_map, package_id) + if len(copied_files) == 0: + return False + + self.dlu_file_handler.chown_dir(package_id, copied_files, int(os.environ['dlu_user'])) + file_info = self.dlu_management.insert_dlu_files(package_id=package_id, file_list=copied_files) + self.dlu_management.update_dlu_package(package_id, {"globus_dlu_status": "success"}) + self.dlu_management.update_dlu_package(package_id, {"ready_to_move_from_globus": "done"}) + self.dlu_mongo.update_package_files(package_id, file_info) + + self.dlu_state.set_package_state(package_id, PackageState.UPLOAD_SUCCEEDED) + self.dlu_state.clear_cache() + return True + def log_err_message_slide_rename(self, error_msg, package_id): + logger.error(error_msg + " for package: " + package_id) + self.dlu_management.update_dlu_package(package_id, {"globus_dlu_status": error_msg}) + def is_directory_valid(self, directory_info, package_id): + logger.info("checking if directory is valid") if directory_info.file_count == 0 and directory_info.subdir_count == 0: error_msg = "Error: package " + package_id + " has no files or top level subdirectory" logger.info(error_msg + " Skipping.") self.dlu_management.update_dlu_package(package_id, {"globus_dlu_status": error_msg}) return False + return True if __name__ == "__main__": From e812218e8548267fd2baa362f679589dff1157a6 Mon Sep 17 00:00:00 2001 From: zwright Date: Fri, 30 Jan 2026 09:51:42 -0500 Subject: [PATCH 46/54] KPMP-6545: load biopsy tracking long table --- data_management/main.py | 4 ++++ data_management/services/dlu_management.py | 6 ++++++ data_management/services/tableau.py | 10 ++++++++++ 3 files changed, 20 insertions(+) diff --git a/data_management/main.py b/data_management/main.py index 9f47c8e..c637dcd 100644 --- a/data_management/main.py +++ b/data_management/main.py @@ -43,6 +43,9 @@ def upsert_new_spectrack_specimens(self): def load_biopsy_tracking(self): return self.tableau.load_biopsy_tracking() + def load_biopsy_tracking_long(self): + return self.tableau.load_biopsy_tracking_long() + def load_data_manager_data(self): return self.tableau.load_data_manager_data() @@ -88,6 +91,7 @@ def update_biomarker_tracking_redcap_ids(self): if args.action == "insert" or args.action == "update": records_modified = main.load_biopsy_tracking() records_modified = records_modified + main.load_data_manager_data() + records_modified = records_modified + main.load_biopsy_tracking_long() if "records_modified" in locals(): logger.info(f"{records_modified} records modified") diff --git a/data_management/services/dlu_management.py b/data_management/services/dlu_management.py index 8b52a16..1bb0dc0 100644 --- a/data_management/services/dlu_management.py +++ b/data_management/services/dlu_management.py @@ -191,6 +191,12 @@ def get_biopsy_tracking(self): ) return result + def get_biopsy_tracking_long(self): + result = self.db.get_data( + "select * from biopsy_tracking_long_v" + ) + return result + def get_data_manager_data(self): result = self.db.get_data( """ diff --git a/data_management/services/tableau.py b/data_management/services/tableau.py index ec05b17..da8f81d 100644 --- a/data_management/services/tableau.py +++ b/data_management/services/tableau.py @@ -31,6 +31,16 @@ def load_biopsy_tracking(self): records_modified = records_modified + 1 return records_modified + def load_biopsy_tracking_long(self): + self.truncate_biopsy_tracking() + bt_results = self.dlu_management.get_biopsy_tracking_long() + query = "INSERT INTO biopsy_tracking_long(redcap_id, specimen_id, dlu_packageType, status) VALUES(%s, %s, %s, %s)" + records_modified = 0 + for result in bt_results: + insert_result = self.db_tableau.insert_data(query, tuple(result.values())) + records_modified = records_modified + 1 + return records_modified + def load_data_manager_data(self): self.truncate_data_manager_data() results = self.dlu_management.get_data_manager_data() From 5f350d742b72fd9b56190e07a70e9396f5f408e9 Mon Sep 17 00:00:00 2001 From: zwright Date: Fri, 30 Jan 2026 10:24:40 -0500 Subject: [PATCH 47/54] KPMP-6545: biopsy tracking long table SQL --- data_management/sql/biopsy_tracking_long.sql | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 data_management/sql/biopsy_tracking_long.sql diff --git a/data_management/sql/biopsy_tracking_long.sql b/data_management/sql/biopsy_tracking_long.sql new file mode 100644 index 0000000..3b86ee2 --- /dev/null +++ b/data_management/sql/biopsy_tracking_long.sql @@ -0,0 +1,9 @@ +CREATE TABLE biopsy_tracking_long +( + redcap_id varchar(100) NULL, + specimen_id varchar(100) NULL, + dlu_packageType varchar(100) NULL, + status varchar(100) NULL +) ENGINE=InnoDB +DEFAULT CHARSET=latin1 +COLLATE=latin1_swedish_ci; \ No newline at end of file From 480b66ffcdef4224646c5930af0549457dc33899 Mon Sep 17 00:00:00 2001 From: zwright Date: Fri, 30 Jan 2026 10:30:29 -0500 Subject: [PATCH 48/54] KPMP-6545: truncate correct table --- data_management/services/tableau.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_management/services/tableau.py b/data_management/services/tableau.py index da8f81d..8863078 100644 --- a/data_management/services/tableau.py +++ b/data_management/services/tableau.py @@ -32,7 +32,7 @@ def load_biopsy_tracking(self): return records_modified def load_biopsy_tracking_long(self): - self.truncate_biopsy_tracking() + self.truncate_biopsy_tracking_long() bt_results = self.dlu_management.get_biopsy_tracking_long() query = "INSERT INTO biopsy_tracking_long(redcap_id, specimen_id, dlu_packageType, status) VALUES(%s, %s, %s, %s)" records_modified = 0 From 31fc4ede09be167c70693733d1a078dc7ff63471 Mon Sep 17 00:00:00 2001 From: zwright Date: Fri, 30 Jan 2026 10:32:49 -0500 Subject: [PATCH 49/54] KPMP-6545: remove unused results --- data_management/services/tableau.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/data_management/services/tableau.py b/data_management/services/tableau.py index 8863078..92f10cb 100644 --- a/data_management/services/tableau.py +++ b/data_management/services/tableau.py @@ -27,7 +27,7 @@ def load_biopsy_tracking(self): query = "INSERT INTO biopsy_tracking (redcap_record_id, `Whole Slide Images`, `Single-nucleus RNA-Seq Status`, `Single-nucleus RNA-Seq Specimen ID`, `ATAC RNA-seq Status`, `ATAC RNA-seq Specimen ID`, `Single-cell RNA-Seq Status`, `Single-cell RNA-Seq Specimen ID`, `Regional Transcriptomics Status`, `Regional Transcriptomics Specimen ID`, `Bulk total/mRNA Experiment Status`, `Bulk total/mRNA Experiment Specimen ID`, `3D Tissue Imaging and Cytometry Experiment Status`, `3D Tissue Imaging and Cytometry Experiment Specimen ID`, `Regional Proteomics Experiment Status`, `Regional Proteomics Specimen ID`, `Spatial Metabolomics Experiment Status`, `Spatial Metabolomics Specimen ID`, `Spatial Lipidomics Experiment Status`, `Spatial Lipidomics Specimen ID`, `Spatial N-glycomics Experiment Status`, `Spatial N-glycomics Specimen ID`, `Spatial Transcriptomics Experiment Status`, `Spatial Transcriptomics Specimen ID`, `CODEX (IU) Experiment Status`, `CODEX (IU) Specimen ID`, `CODEX (UCSF) Experiment Status`, `CODEX (UCSF) Specimen ID`, `IMC Experiment Status`, `IMC Specimen ID`, `DNA Methyl-seq Experiment Status`, `DNA Methyl-seq Specimen ID`, `CUT & RUN Experiment Status`, `CUT & RUN Specimen ID`, `Metabolon Timed Urine - UHPLC MS-MS Experiment Status`, `Metabolon Timed Urine - UHPLC MS-MS Specimen ID`, `Metabolon Plasma EDTA - UHPLC MS-MS Experiment Status`, `Metabolon Plasma EDTA - UHPLC MS-MS Specimen ID`, `MSDQ120 Spot Urine Biomarker Status`, `MSDQ120 Spot Urine Biomarker Specimen ID`, `MSDQ120 Plasma EDTA Biomarker Status`, `MSDQ120 Plasma EDTA Biomarker Specimen ID`, `Litholink Timed Urine - BCAU680 - Status`, `Litholink Timed Urine - BCAU680 - Specimen ID`, `Stool Microbiome - Qaigen NextEra Status`, `Stool Microbiome - Qaigen NextEra Specimen ID`, `Clinical Chemistry Spot/Timed Urine - BCAU5812 Status`, `Clinical Chemistry Spot/Timed Urine - BCAU5812 Specimen ID`, `Clinical Chemistry Serum - BCAU5812 Status`, `Clinical Chemistry Serum - BCAU5812 Specimen ID`, `SomaScan Plasma EDTA - Status`, `SomaScan Plasma EDTA - Specimen ID`, `SomaScan Spot Urine - Status`, `SomaScan Spot Urine - Specimen ID`, `Descriptor Scoring (TIV)`, `Segmentation/Features Data - Status`, `fMRI - Status`, `Retinal - Status`) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" records_modified = 0 for result in bt_results: - insert_result = self.db_tableau.insert_data(query, tuple(result.values())) + self.db_tableau.insert_data(query, tuple(result.values())) records_modified = records_modified + 1 return records_modified @@ -37,7 +37,7 @@ def load_biopsy_tracking_long(self): query = "INSERT INTO biopsy_tracking_long(redcap_id, specimen_id, dlu_packageType, status) VALUES(%s, %s, %s, %s)" records_modified = 0 for result in bt_results: - insert_result = self.db_tableau.insert_data(query, tuple(result.values())) + self.db_tableau.insert_data(query, tuple(result.values())) records_modified = records_modified + 1 return records_modified @@ -49,7 +49,7 @@ def load_data_manager_data(self): records_modified = 0 for result in results: result["dlu_created"] = result["dlu_created"].strftime('%Y-%m-%d %H:%M:%S') - insert_result = self.db_tableau.insert_data(query, tuple(result.values())) + self.db_tableau.insert_data(query, tuple(result.values())) records_modified = records_modified + 1 return records_modified From a5eb85ff7cb9a80a51a8e147cfdaf8ba79a45630 Mon Sep 17 00:00:00 2001 From: zwright Date: Tue, 3 Feb 2026 16:25:36 -0500 Subject: [PATCH 50/54] KPMP-6545: remove unused results --- data_management/services/tableau.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/data_management/services/tableau.py b/data_management/services/tableau.py index 92f10cb..4265845 100644 --- a/data_management/services/tableau.py +++ b/data_management/services/tableau.py @@ -15,6 +15,12 @@ def truncate_biopsy_tracking(self): ) return result + def truncate_biopsy_tracking_long(self): + result = self.db_tableau.get_data( + "truncate table biopsy_tracking_long" + ) + return result + def truncate_data_manager_data(self): result = self.db_tableau.get_data( "truncate table data_manager_data" From d84acc446ed6fa86198037a9607d2271e9a3750b Mon Sep 17 00:00:00 2001 From: zwright Date: Wed, 4 Feb 2026 13:38:27 -0500 Subject: [PATCH 51/54] KPMP-6545: add tis --- data_management/services/tableau.py | 2 +- data_management/sql/biopsy_tracking_long.sql | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/data_management/services/tableau.py b/data_management/services/tableau.py index 4265845..ce356d6 100644 --- a/data_management/services/tableau.py +++ b/data_management/services/tableau.py @@ -40,7 +40,7 @@ def load_biopsy_tracking(self): def load_biopsy_tracking_long(self): self.truncate_biopsy_tracking_long() bt_results = self.dlu_management.get_biopsy_tracking_long() - query = "INSERT INTO biopsy_tracking_long(redcap_id, specimen_id, dlu_packageType, status) VALUES(%s, %s, %s, %s)" + query = "INSERT INTO biopsy_tracking_long(redcap_id, specimen_id, dlu_tis, dlu_packageType, status) VALUES(%s, %s, %s, %s)" records_modified = 0 for result in bt_results: self.db_tableau.insert_data(query, tuple(result.values())) diff --git a/data_management/sql/biopsy_tracking_long.sql b/data_management/sql/biopsy_tracking_long.sql index 3b86ee2..e302495 100644 --- a/data_management/sql/biopsy_tracking_long.sql +++ b/data_management/sql/biopsy_tracking_long.sql @@ -2,6 +2,7 @@ CREATE TABLE biopsy_tracking_long ( redcap_id varchar(100) NULL, specimen_id varchar(100) NULL, + dlu_tis varchar(100) NULL, dlu_packageType varchar(100) NULL, status varchar(100) NULL ) ENGINE=InnoDB From 2e4886461508364eb517f1c250fe859f7ffe14e6 Mon Sep 17 00:00:00 2001 From: zwright Date: Wed, 4 Feb 2026 13:49:12 -0500 Subject: [PATCH 52/54] KPMP-6545: forgot format string --- data_management/services/tableau.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_management/services/tableau.py b/data_management/services/tableau.py index ce356d6..0c678fe 100644 --- a/data_management/services/tableau.py +++ b/data_management/services/tableau.py @@ -40,7 +40,7 @@ def load_biopsy_tracking(self): def load_biopsy_tracking_long(self): self.truncate_biopsy_tracking_long() bt_results = self.dlu_management.get_biopsy_tracking_long() - query = "INSERT INTO biopsy_tracking_long(redcap_id, specimen_id, dlu_tis, dlu_packageType, status) VALUES(%s, %s, %s, %s)" + query = "INSERT INTO biopsy_tracking_long(redcap_id, specimen_id, dlu_tis, dlu_packageType, status) VALUES(%s, %s, %s, %s, %s)" records_modified = 0 for result in bt_results: self.db_tableau.insert_data(query, tuple(result.values())) From 7507692ec11d8af7b4ddc6afbf685e23ed28d299 Mon Sep 17 00:00:00 2001 From: zwright Date: Mon, 23 Mar 2026 13:48:16 -0400 Subject: [PATCH 53/54] KPMP-6566: try chunks --- data_management/services/dlu_filesystem.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/data_management/services/dlu_filesystem.py b/data_management/services/dlu_filesystem.py index f1b9d44..84b9305 100644 --- a/data_management/services/dlu_filesystem.py +++ b/data_management/services/dlu_filesystem.py @@ -15,15 +15,19 @@ def calculate_checksum(file_path: str): - if os.path.isdir(file_path): return "0" + if os.path.getsize(file_path) == 0: - # This is apparently the md5 returned for an empty file return 'd41d8cd98f00b204e9800998ecf8427e' - elif ".zarr" not in file_path: - with open(file_path) as f, mmap(f.fileno(), 0, access=ACCESS_READ) as f: - return md5(f).hexdigest() + + if ".zarr" not in file_path: + hash_md5 = hashlib.md5() + with open(file_path, "rb") as f: + # Read in 1MB chunks to keep RAM usage low + for chunk in iter(lambda: f.read(1024 * 1024), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() else: return compute_zarr_checksum(yield_files_local(file_path)).md5 From a4cef739fba1a4d39bc9a88035648497bff2c923 Mon Sep 17 00:00:00 2001 From: zwright Date: Wed, 25 Mar 2026 15:25:09 -0400 Subject: [PATCH 54/54] KPMP-6566: increase timeout --- data_management/Dockerfile | 2 +- data_management/services/dlu_filesystem.py | 14 +++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/data_management/Dockerfile b/data_management/Dockerfile index 1fe48b1..cb6adef 100644 --- a/data_management/Dockerfile +++ b/data_management/Dockerfile @@ -31,5 +31,5 @@ COPY app.py ./ COPY process_bulk_uploads.py ./ COPY services/ ./services -ENTRYPOINT ["gunicorn", "-b", ":5000", "app:app", "-t", "600"] +ENTRYPOINT ["gunicorn", "-b", ":5000", "app:app", "-t", "1200"] diff --git a/data_management/services/dlu_filesystem.py b/data_management/services/dlu_filesystem.py index 84b9305..f1b9d44 100644 --- a/data_management/services/dlu_filesystem.py +++ b/data_management/services/dlu_filesystem.py @@ -15,19 +15,15 @@ def calculate_checksum(file_path: str): + if os.path.isdir(file_path): return "0" - if os.path.getsize(file_path) == 0: + # This is apparently the md5 returned for an empty file return 'd41d8cd98f00b204e9800998ecf8427e' - - if ".zarr" not in file_path: - hash_md5 = hashlib.md5() - with open(file_path, "rb") as f: - # Read in 1MB chunks to keep RAM usage low - for chunk in iter(lambda: f.read(1024 * 1024), b""): - hash_md5.update(chunk) - return hash_md5.hexdigest() + elif ".zarr" not in file_path: + with open(file_path) as f, mmap(f.fileno(), 0, access=ACCESS_READ) as f: + return md5(f).hexdigest() else: return compute_zarr_checksum(yield_files_local(file_path)).md5