From c56cd8668384ff5c1efe11cbc4d3a34a2119f8ad Mon Sep 17 00:00:00 2001 From: Daniel Garcia Briseno Date: Wed, 13 May 2026 11:15:34 -0400 Subject: [PATCH 1/5] Add ability for downloaders to filter for specific files --- .../helioviewer/hvpull/browser/httpbrowser.py | 22 +++++++++++-------- install/helioviewer/hvpull/net/daemon.py | 2 +- .../helioviewer/hvpull/servers/__init__.py | 8 +++++++ install/helioviewer/hvpull/servers/punch.py | 6 +++++ 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/install/helioviewer/hvpull/browser/httpbrowser.py b/install/helioviewer/hvpull/browser/httpbrowser.py index d235f3476..88b3df308 100644 --- a/install/helioviewer/hvpull/browser/httpbrowser.py +++ b/install/helioviewer/hvpull/browser/httpbrowser.py @@ -22,9 +22,9 @@ def read(self, uri): usock = urllib.request.urlopen(uri) self.feed(usock.read().decode(usock.headers.get_content_charset())) usock.close() - + return self.urls - + def reset(self): """Reset state of URLLister""" HTMLParser.reset(self) @@ -58,7 +58,7 @@ def read(self, uri): print (e) return self.urls - + def reset(self): """Reset state of URLLister""" SGMLParser.reset(self) @@ -73,7 +73,7 @@ class HTTPDataBrowser(BaseDataBrowser): def __init__(self, server): BaseDataBrowser.__init__(self, server) socket.setdefaulttimeout(60) - + def get_directories(self, start_date, end_date): """Generates a list of remote directories which may be queried for files corresponding to the requested range. Note that these @@ -81,17 +81,21 @@ def get_directories(self, start_date, end_date): # filter(lambda url: url.endswith("/"), self._query(location)) return self.server.compute_directories(start_date, end_date) - def get_files(self, location, extension): + def get_files(self, location, extension, filter_func: callable | None = None): """Get all the files that end with specified extension at the uri""" files = None num_retries = 0 - + # Get a list of the files at the remote location, if it exists # To avoid spending too much time, we will timeout after a short time # and retry up to 10 times. while files is None and num_retries <= 10: try: + # Only grab files with the matching file extension files = filter(lambda url: url.endswith("." + extension), self._query(location)) + # If there is a user-defined filter function, use that to only get those specific files. + if filter_func is not None: + files = filter(filter_func, files) except IOError as e: if isinstance(e.strerror, socket.error): # if server is unreachable, raise an exception @@ -105,10 +109,10 @@ def get_files(self, location, extension): files = [] return files - + def _query(self, location): """Get a list of files and folders at the specified remote location""" - # query the remote location for the list of files and subdirectories + # query the remote location for the list of files and subdirectories if (sys.version_info >= (3, 0)): url_lister = URLLister() @@ -121,4 +125,4 @@ def _query(self, location): urls = filter(lambda url: url[0] != "/" and url[0] != "?", result) return [os.path.join(location, url) for url in urls] - + diff --git a/install/helioviewer/hvpull/net/daemon.py b/install/helioviewer/hvpull/net/daemon.py index f7504bd73..fddc3a3c7 100644 --- a/install/helioviewer/hvpull/net/daemon.py +++ b/install/helioviewer/hvpull/net/daemon.py @@ -421,7 +421,7 @@ def query_server(self, browser, starttime, endtime): return [] try: - matches = browser.get_files(directory, "jp2") + matches = browser.get_files(directory, "jp2", browser.server.filter) files.extend(matches) except NetworkError: diff --git a/install/helioviewer/hvpull/servers/__init__.py b/install/helioviewer/hvpull/servers/__init__.py index 5c871dd50..511256b1c 100644 --- a/install/helioviewer/hvpull/servers/__init__.py +++ b/install/helioviewer/hvpull/servers/__init__.py @@ -68,6 +68,14 @@ def get_dates(self, starttime, endtime): return dates + def filter(self, file: str) -> bool: + """ + Returns True if the file should be downloaded, otherwise False. + This may be overridden by specific Data Servers to only download + specific files from the upstream directory + """ + return True + def get_file_regex(self): """Returns a regex which described the expected format of filenames on the server""" diff --git a/install/helioviewer/hvpull/servers/punch.py b/install/helioviewer/hvpull/servers/punch.py index 7be5e473e..c9d48f550 100644 --- a/install/helioviewer/hvpull/servers/punch.py +++ b/install/helioviewer/hvpull/servers/punch.py @@ -24,3 +24,9 @@ def get_datetime_from_file(self, filename): fname = os.path.basename(filename) datestr = fname[13:27] return datetime.datetime.strptime(datestr, '%Y%m%d%H%M%S') + + def filter(self, filename: str) -> bool: + """ + PUNCH should only download the v0k files at this time. + """ + return "v0k" in filename From 2ab31616f9ea9bb6c595aaa69e01d5b7f07adec2 Mon Sep 17 00:00:00 2001 From: Daniel Garcia Briseno Date: Wed, 13 May 2026 11:32:01 -0400 Subject: [PATCH 2/5] Update downloader to re-scan --- install/helioviewer/hvpull/net/daemon.py | 3 ++- install/helioviewer/hvpull/servers/__init__.py | 12 +----------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/install/helioviewer/hvpull/net/daemon.py b/install/helioviewer/hvpull/net/daemon.py index fddc3a3c7..88625eb52 100644 --- a/install/helioviewer/hvpull/net/daemon.py +++ b/install/helioviewer/hvpull/net/daemon.py @@ -174,7 +174,7 @@ def start(self, starttime=None, endtime=None, backfill=None): # get a list of files available # self.oldest_timestamp gets set by query() during the first run # before the main loop. - self.query(starttime, now) + self.query(self.oldest_timestamp, now) self.sleep() @@ -241,6 +241,7 @@ def query(self, starttime, endtime): try: # Filter by time range filtered = self._filter_files_by_time(url_list, starttime, endtime) + # Filter to only download new files that have not already been downloaded previously. filtered = list(filter(self._filter_new, filtered)) except mysqld.OperationalError: # MySQL has gone away -- try again in 5s diff --git a/install/helioviewer/hvpull/servers/__init__.py b/install/helioviewer/hvpull/servers/__init__.py index 511256b1c..8a099e2a7 100644 --- a/install/helioviewer/hvpull/servers/__init__.py +++ b/install/helioviewer/hvpull/servers/__init__.py @@ -76,11 +76,6 @@ def filter(self, file: str) -> bool: """ return True - def get_file_regex(self): - """Returns a regex which described the expected format of filenames on - the server""" - return self.filename_regex - def get_measurements(self, nicknames, dates): """Get a list of all the URIs down to the measurement""" return None @@ -93,7 +88,7 @@ def get_datetime_from_file(self, filename): return get_datetime_from_file(filename) -class DataServerPauseDelayDefinesDefaultStartTime: +class DataServerPauseDelayDefinesDefaultStartTime(DataServer): """Class for interacting with data servers. In this class the pause defines the default start time. If real time is UTC, then the default start time is UTC - pause minutes.""" @@ -134,11 +129,6 @@ def get_dates(self, starttime, endtime): return dates - def get_file_regex(self): - """Returns a regex which described the expected format of filenames on - the server""" - return self.filename_regex - def get_measurements(self, nicknames, dates): """Get a list of all the URIs down to the measurement""" return None From 5c57b9cee743fe796265beb279d7d5c20284f106 Mon Sep 17 00:00:00 2001 From: Daniel Garcia Briseno Date: Wed, 13 May 2026 11:39:44 -0400 Subject: [PATCH 3/5] Add starttime/endtime validation --- install/helioviewer/hvpull/net/daemon.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/install/helioviewer/hvpull/net/daemon.py b/install/helioviewer/hvpull/net/daemon.py index 88625eb52..a6d0f8543 100644 --- a/install/helioviewer/hvpull/net/daemon.py +++ b/install/helioviewer/hvpull/net/daemon.py @@ -201,6 +201,8 @@ def query(self, starttime, endtime): if any new files have appeared since the first execution. This continues until no new files are found (for xxx minutes?) """ + if (starttime > endtime): + raise ValueError(f"Start Time {starttime} is ahead of End Time {endtime}. No files would be downloaded.") urls = [] fmt = '%Y-%m-%d %H:%M:%S' From c8a1de769168cf386d8a5ff80193f4d191f7dfeb Mon Sep 17 00:00:00 2001 From: Daniel Garcia Briseno Date: Wed, 13 May 2026 12:01:09 -0400 Subject: [PATCH 4/5] don't let query magic make starttime go earlier than requested --- install/helioviewer/hvpull/net/daemon.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/install/helioviewer/hvpull/net/daemon.py b/install/helioviewer/hvpull/net/daemon.py index a6d0f8543..336f71f0d 100644 --- a/install/helioviewer/hvpull/net/daemon.py +++ b/install/helioviewer/hvpull/net/daemon.py @@ -325,11 +325,13 @@ def query(self, starttime, endtime): if self.servers[0].name in ['LMSAL2']: new_urls.append(extra_filtered) if len(extra_filtered) > 0: - self.oldest_timestamp = self._get_oldest_image(extra_filtered) + # Using max(starttime, ...) so oldest_timestamp never goes earlier than the initial requested starttime + self.oldest_timestamp = max(starttime, self._get_oldest_image(extra_filtered)) else: new_urls.append(filtered) if len(filtered) > 0: - self.oldest_timestamp = self._get_oldest_image(filtered) + # Using max(starttime, ...) so oldest_timestamp never goes earlier than the initial requested starttime + self.oldest_timestamp = max(starttime, self._get_oldest_image(filtered)) # check disk space if not self.sent_diskspace_warning: From d95a56ef3a1a1668b25a6362adb3deb8111f2439 Mon Sep 17 00:00:00 2001 From: Daniel Garcia Briseno Date: Wed, 13 May 2026 12:06:10 -0400 Subject: [PATCH 5/5] don't exclusively pick up v0k --- install/helioviewer/hvpull/servers/punch.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/install/helioviewer/hvpull/servers/punch.py b/install/helioviewer/hvpull/servers/punch.py index c9d48f550..7be5e473e 100644 --- a/install/helioviewer/hvpull/servers/punch.py +++ b/install/helioviewer/hvpull/servers/punch.py @@ -24,9 +24,3 @@ def get_datetime_from_file(self, filename): fname = os.path.basename(filename) datestr = fname[13:27] return datetime.datetime.strptime(datestr, '%Y%m%d%H%M%S') - - def filter(self, filename: str) -> bool: - """ - PUNCH should only download the v0k files at this time. - """ - return "v0k" in filename