diff --git a/CHANGELOG.md b/CHANGELOG.md
index 95946fe..c2c71f1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 * add counties, thanks to [Ray Kiddy](https://github.com/rkiddy)
 * DC has returned to `STATES_AND_TERRITORIES`, thanks to [Kavi Gupta](https://github.com/kavigupta)
+* add `clean_name()` function and `fallback_func` parameter to `lookup()` to provide customizable matching, thanks to [Max Filenko](https://github.com/mfilenko) and [Charlie Tonneslan](https://github.com/c-tonneslan)
 * fix `py.typed` location, thanks to [johnw-bluemark](https://github.com/johnw-bluemark)
 * add support for Python 3.13 and 3.14
 * switch to [uv](https://docs.astral.sh/uv/) for development and packaging
diff --git a/README.md b/README.md
index 53a93a1..e9dc2aa 100644
--- a/README.md
+++ b/README.md
@@ -141,6 +141,52 @@ we've got phonetic name matching too:
 ```
 
 
+### Cleaning up messy input
+
+If your lookup values come from user input or other unpredictable sources,
+`clean_name()` can strip them down to something the lookup is more likely to
+match. It lowercases the input, removes punctuation, and drops the filler
+words `the`, `commonwealth`, `state`, and `of`:
+
+```python
+>>> us.states.clean_name(' The state OF idaho ')
+'idaho'
+>>> us.states.clean_name('Commonwealth of Virginia')
+'virginia'
+>>> us.states.lookup(us.states.clean_name('The State of Maryland!'))
+<State:Maryland>
+```
+
+`clean_name` is a standalone helper and `lookup` does not call it
+automatically. Apply it yourself when you want it.
+
+
+### Custom fallback matching
+
+`lookup` accepts an optional `fallback_func` that is called when none of the
+built-in matching strategies find a state. It receives the original lookup
+value and should return a `State` or `None`:
+
+```python
+>>> def my_fallback(val):
+...     return us.states.AK if val == 'the big one' else None
+>>> us.states.lookup('the big one', fallback_func=my_fallback)
+<State:Alaska>
+```
+
+For the common case of matching against the start of a state name, the
+included `startswith_fallback` helper does just that (case-insensitively):
+
+```python
+>>> us.states.lookup('calif', fallback_func=us.states.startswith_fallback)
+<State:California>
+```
+
+Fallback results are cached separately per fallback function, so a cached
+fallback match won't leak into lookups that pass a different fallback (or
+no fallback at all).
+
+
 ### Shapefiles
 
 You want shapefiles too? As long as you want 2010 shapefiles, we've gotcha covered.
diff --git a/us/states.py b/us/states.py
index 59d9b5a..9ef8c5a 100644
--- a/us/states.py
+++ b/us/states.py
@@ -1,6 +1,6 @@
 import os
 import re
-from typing import Any, Dict, Iterable, List, Optional
+from typing import Any, Callable, Dict, Iterable, List, Optional
 from urllib.parse import urljoin
 
 import jellyfish  # type: ignore
@@ -80,7 +80,12 @@ def shapefile_urls(self) -> Optional[Dict[str, str]]:
         return urls
 
 
-def lookup(val, field: Optional[str] = None, use_cache: bool = True) -> Optional[State]:
+def lookup(
+    val,
+    field: Optional[str] = None,
+    use_cache: bool = True,
+    fallback_func: Optional[Callable[[Any], Optional[State]]] = None,
+) -> Optional[State]:
     """Semi-fuzzy state lookup. This method will make a best effort
     attempt at finding the state based on the lookup value provided.
 
@@ -95,10 +100,22 @@ def lookup(val, field: Optional[str] = None, use_cache: bool = True) -> Optional
     the `field` argument. This skips the fuzzy-ish matching and does an
     exact, case-sensitive comparison against the specified field.
 
-    This method caches non-None results, but can the cache can be bypassed
-    with the `use_cache=False` argument.
+    If no match is found and `fallback_func` is provided, it is called as
+    `fallback_func(val)` -- with the original, unmodified lookup value --
+    and should return a `State` or `None`. The fallback decides for itself
+    what to match against. See `startswith_fallback` for a ready-made
+    example.
+
+    This method caches non-None results, but the cache can be bypassed
+    with the `use_cache=False` argument. A cache hit returns immediately
+    without scanning the state list. Fallback matches are cached separately,
+    per fallback function and per original lookup value, so a cached
+    fallback result is never returned to a lookup that passes a different
+    fallback, a different value, or no fallback at all.
     """
 
+    original_val = val
+
     matched_state = None
 
     if field is None:
@@ -111,10 +128,24 @@ def lookup(val, field: Optional[str] = None, use_cache: bool = True) -> Optional
             val = jellyfish.metaphone(val)
             field = "name_metaphone"
 
-    # see if result is in cache
     cache_key = f"{field}:{val}"
+
+    # a normal-path cache hit short-circuits the rest of the method
     if use_cache and cache_key in _lookup_cache:
-        matched_state = _lookup_cache[cache_key]
+        return _lookup_cache[cache_key]
+
+    # fallback results are cached under a separate, fallback-specific key so
+    # they never leak into non-fallback lookups or lookups using a different
+    # fallback. The key is built from the ORIGINAL value, because that is what
+    # the fallback receives: inputs such as "calif" and "kalif" normalize to
+    # the same metaphone but must not share a cached fallback result.
+    # NOTE: repr() of a plain function embeds its address, so throwaway
+    # lambdas can alias one another here; prefer long-lived fallbacks.
+    fallback_key = None
+    if fallback_func is not None:
+        fallback_key = f"{field}:{original_val!r}:fallback:{fallback_func!r}"
+        if use_cache and fallback_key in _lookup_cache:
+            return _lookup_cache[fallback_key]
 
     for state in STATES_AND_TERRITORIES:
         if val == getattr(state, field):
@@ -122,9 +153,57 @@ def lookup(val, field: Optional[str] = None, use_cache: bool = True) -> Optional
             if use_cache:
                 _lookup_cache[cache_key] = state
 
+    if matched_state is None and fallback_func is not None:
+        matched_state = fallback_func(original_val)
+        # fallback_key is always set when fallback_func is; the explicit check
+        # just narrows Optional[str] for type checkers
+        if matched_state is not None and use_cache and fallback_key is not None:
+            _lookup_cache[fallback_key] = matched_state
+
     return matched_state
 
 
+_CLEAN_NAME_STOPWORDS = {"the", "commonwealth", "state", "of"}
+
+
+def clean_name(text: str) -> str:
+    """Strip an incoming string down to a bare state name.
+
+    Removes punctuation and the filler words "the", "commonwealth", and
+    "state" -- plus "of" when it leads the text or follows another filler --
+    then rejoins the remaining tokens as a space-separated, lowercased string.
+
+    >>> clean_name(" The state OF idaho ")
+    'idaho'
+
+    This is a standalone helper; `lookup` does not call it automatically.
+    """
+
+    # punctuation and underscores become spaces so adjacent words don't fuse
+    words = re.sub(r"[\W_]", " ", text.lower()).split()
+    # an "of" inside a real name ("District of Columbia") must survive
+    inner_of = {i for i, w in enumerate(words) if w == "of" and i > 0 and words[i - 1] not in _CLEAN_NAME_STOPWORDS}
+    return " ".join(w for i, w in enumerate(words) if w not in _CLEAN_NAME_STOPWORDS or i in inner_of)
+
+
+def startswith_fallback(val: str) -> Optional[State]:
+    """A `fallback_func` for `lookup` that matches `val` against the start
+    of each state's or territory's name, case-insensitively.
+
+    Returns the first state (in `STATES_AND_TERRITORIES` order) whose name
+    starts with `val`, or `None` if nothing matches, `val` is empty, or `val`
+    is not a string.
+    """
+
+    # `lookup` types the fallback's argument as `Any`, so a non-string value
+    # is treated as "no match" instead of raising AttributeError
+    if not isinstance(val, str) or not val:
+        return None
+
+    needle = val.lower()
+    return next((s for s in STATES_AND_TERRITORIES if s.name.lower().startswith(needle)), None)
+
+
 def mapping(from_field: str, to_field: str, states: Optional[Iterable[State]] = None) -> Dict[Any, Any]:
     if states is None:
         states = STATES_AND_TERRITORIES
diff --git a/us/tests/test_us.py b/us/tests/test_us.py
index ab667b4..f156367 100644
--- a/us/tests/test_us.py
+++ b/us/tests/test_us.py
@@ -74,6 +74,101 @@ def test_obsolete_lookup():
         assert us.states.lookup(state.name) is None
 
 
+# clean_name
+
+
+def test_clean_name():
+    assert us.states.clean_name(" The state OF idaho ") == "idaho"
+    assert us.states.clean_name("Idaho!") == "idaho"
+    assert us.states.clean_name("idaho") == "idaho"
+    assert us.states.clean_name("") == ""
+    assert us.states.clean_name("the state of") == ""
+    assert us.states.clean_name("New York") == "new york"
+    assert us.states.clean_name("new_york") == "new york"
+    # "of" that is part of the name itself is kept so the result still matches
+    assert us.states.clean_name("District of Columbia") == "district of columbia"
+
+
+# fallback_func / startswith_fallback
+
+
+def test_startswith_fallback():
+    california = us.states.lookup("CA")
+    assert us.states.startswith_fallback("calif") == california
+    assert us.states.startswith_fallback("CALIF") == california
+    assert us.states.startswith_fallback("zzz") is None
+    assert us.states.startswith_fallback("") is None
+    assert us.states.startswith_fallback(None) is None
+
+
+def test_lookup_fallback_func():
+    california = us.states.lookup("CA")
+    idaho = us.states.lookup("ID")
+
+    # a garbage value that normally misses resolves via the fallback
+    assert us.states.lookup("calif", use_cache=False, fallback_func=us.states.startswith_fallback) == california
+
+    # without a fallback the same value still returns None
+    assert us.states.lookup("calif", use_cache=False) is None
+    assert us.states.lookup("calif", use_cache=False, fallback_func=None) is None
+
+    def boom(val):
+        raise AssertionError("fallback_func should not be called on a match")
+
+    # the fallback is not consulted when the normal scan matches
+    assert us.states.lookup("idaho", use_cache=False, fallback_func=boom) == idaho
+
+    # ...nor when a cache hit short-circuits the lookup
+    us.states.lookup("idaho")  # prime the cache
+    assert us.states.lookup("idaho", fallback_func=boom) == idaho
+
+
+def test_lookup_fallback_caching():
+    california = us.states.lookup("CA")
+
+    calls = []
+
+    def counting_fallback(val):
+        calls.append(val)
+        return us.states.startswith_fallback(val)
+
+    # a fallback hit is cached: the second call is served without re-invoking
+    assert us.states.lookup("califo", fallback_func=counting_fallback) == california
+    assert us.states.lookup("califo", fallback_func=counting_fallback) == california
+    assert calls == ["califo"]
+
+    # the cached fallback hit does NOT leak into a no-fallback lookup
+    assert us.states.lookup("califo") is None
+
+    # ...nor into a lookup using a different fallback
+    other_calls = []
+
+    def other_fallback(val):
+        other_calls.append(val)
+        return None
+
+    assert us.states.lookup("califo", fallback_func=other_fallback) is None
+    assert other_calls == ["califo"]
+
+    # use_cache=False neither reads nor writes the cache: the fallback runs
+    # on every call
+    calls.clear()
+    assert us.states.lookup("califo", use_cache=False, fallback_func=counting_fallback) == california
+    assert us.states.lookup("califo", use_cache=False, fallback_func=counting_fallback) == california
+    assert calls == ["califo", "califo"]
+
+
+def test_lookup_cache_hit_short_circuit():
+    # poison the cache with a deliberately wrong answer; if the cache-hit
+    # short-circuit works, lookup returns it without scanning the state list
+    cache = us.states._lookup_cache
+    cache["abbr:MD"] = us.states.CA
+    try:
+        assert us.states.lookup("MD") == us.states.CA
+    finally:
+        cache.pop("abbr:MD", None)
+
+
 # test metaphone
 
 