diff --git a/README.md b/README.md index 53a93a1..65b0a55 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,25 @@ we've got phonetic name matching too: ``` +When the input is wrapped in extra words or punctuation, `clean_name` strips +it down to a bare state name before lookup: + +```python +>>> us.states.clean_name(' The state OF idaho ') +'idaho' +>>> us.states.lookup(us.states.clean_name('Commonwealth of Virginia')) +<State:Virginia> +``` + +If the built-in matching comes up empty, pass a `fallback_func` to supply +your own match logic. The `startswith_fallback` helper matches on a state +name prefix: + +```python +>>> us.states.lookup('verm', fallback_func=us.states.startswith_fallback) +<State:Vermont> +``` + ### Shapefiles diff --git a/us/states.py b/us/states.py index 59d9b5a..ef1d7e7 100644 --- a/us/states.py +++ b/us/states.py @@ -1,6 +1,6 @@ import os import re -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Callable, Dict, Iterable, List, Optional from urllib.parse import urljoin import jellyfish # type: ignore @@ -13,6 +13,9 @@ _lookup_cache: Dict[str, "State"] = {} +# Words stripped by clean_name when normalizing free-form text to a state name. +_NAME_STOP_WORDS = frozenset({"the", "state", "commonwealth", "of"}) + class County: name: str @@ -80,7 +83,50 @@ def shapefile_urls(self) -> Optional[Dict[str, str]]: return urls -def lookup(val, field: Optional[str] = None, use_cache: bool = True) -> Optional[State]: +def clean_name(val: str) -> str: + """Normalize free-form text down to a bare state name. + + Removes punctuation and the filler words "the", "state", "commonwealth", + and "of", then collapses whitespace. For example, " The state OF idaho " + becomes "idaho" and "Commonwealth of Virginia" becomes "virginia".
+ """ + + # drop punctuation, keeping word and whitespace boundaries intact + val = re.sub(r"[^\w\s]", " ", val.lower()) + + # tokenize (this also trims and collapses whitespace) and drop stop words + tokens = [t for t in val.split() if t not in _NAME_STOP_WORDS] + + return " ".join(tokens) + + +def startswith_fallback(val: str) -> Optional[State]: + """A `fallback_func` for `lookup` that matches on a state name prefix. + + Returns the first state (or territory) whose name starts with `val`, + compared case-insensitively. Returns None when nothing matches. Note + that a short prefix may be ambiguous (e.g. "m" matches many states); + the first match wins. An empty string matches nothing. + """ + + if not val: + return None + + val = val.lower() + + for state in STATES_AND_TERRITORIES: + if state.name.lower().startswith(val): + return state + + return None + + +def lookup( + val, + field: Optional[str] = None, + use_cache: bool = True, + fallback_func: Optional[Callable[[Any], Optional[State]]] = None, +) -> Optional[State]: """Semi-fuzzy state lookup. This method will make a best effort attempt at finding the state based on the lookup value provided. @@ -97,10 +143,20 @@ def lookup(val, field: Optional[str] = None, use_cache: bool = True) -> Optional This method caches non-None results, but can the cache can be bypassed with the `use_cache=False` argument. + + If no match is found and a `fallback_func` is provided, it is called + with the original, untransformed lookup value as its single argument + and its return value is used as the result. This lets callers supply + custom match logic (see `startswith_fallback`). Fallback results are + not cached. """ matched_state = None + # preserve the caller's original value for the fallback, since `val` + # may be reassigned below (e.g. 
to its metaphone) + original_val = val + if field is None: if FIPS_RE.match(val): field = "fips" @@ -122,6 +178,10 @@ def lookup(val, field: Optional[str] = None, use_cache: bool = True) -> Optional if use_cache: _lookup_cache[cache_key] = state + # no match yet — give the caller-provided fallback a final attempt + if matched_state is None and fallback_func is not None: + matched_state = fallback_func(original_val) + return matched_state diff --git a/us/tests/test_us.py b/us/tests/test_us.py index ab667b4..c062ff1 100644 --- a/us/tests/test_us.py +++ b/us/tests/test_us.py @@ -74,6 +74,84 @@ def test_obsolete_lookup(): assert us.states.lookup(state.name) is None +# clean_name + + +def test_clean_name_example(): + assert us.states.clean_name(" The state OF idaho ") == "idaho" + + +def test_clean_name_removes_punctuation(): + assert us.states.clean_name("Idaho!") == "idaho" + assert us.states.clean_name("North-Dakota") == "north dakota" + + +def test_clean_name_removes_stop_words(): + assert us.states.clean_name("Commonwealth of Virginia") == "virginia" + assert us.states.clean_name("The State of New York") == "new york" + + +def test_clean_name_collapses_whitespace(): + assert us.states.clean_name(" new york ") == "new york" + + +def test_clean_name_passthrough(): + assert us.states.clean_name("Idaho") == "idaho" + + +# fallback_func + + +def test_lookup_fallback_used_when_no_match(): + assert us.states.lookup("idah", fallback_func=us.states.startswith_fallback) == us.states.ID + + +def test_lookup_fallback_not_used_when_matched(): + called = [] + + def spy(val): + called.append(val) + return None + + assert us.states.lookup("Maryland", fallback_func=spy) == us.states.MD + assert called == [] + + +def test_lookup_fallback_default_none(): + assert us.states.lookup("notastate") is None + + +def test_lookup_fallback_receives_original_value(): + received = [] + + def spy(val): + received.append(val) + return None + + us.states.lookup("notastate", fallback_func=spy) + 
assert received == ["notastate"] + + +# startswith_fallback + + +def test_startswith_fallback_matches_prefix(): + assert us.states.startswith_fallback("ida") == us.states.ID + assert us.states.startswith_fallback("IDA") == us.states.ID + + +def test_startswith_fallback_no_match(): + assert us.states.startswith_fallback("zzz") is None + + +def test_startswith_fallback_empty_string(): + assert us.states.startswith_fallback("") is None + + +def test_startswith_fallback_via_lookup(): + assert us.states.lookup("Verm", fallback_func=us.states.startswith_fallback) == us.states.VT + + # test metaphone