Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

* add counties, thanks to [Ray Kiddy](https://github.com/rkiddy)
* DC has returned to `STATES_AND_TERRITORIES`, thanks to [Kavi Gupta](https://github.com/kavigupta)
* add `clean_name()` function and `fallback_func` parameter to `lookup()` to provide customizable matching, thanks to [Max Filenko](https://github.com/mfilenko) and [Charlie Tonneslan](https://github.com/c-tonneslan)
* fix `py.typed` location, thanks to [johnw-bluemark](https://github.com/johnw-bluemark)
* add support for Python 3.13 and 3.14
* switch to [uv](https://docs.astral.sh/uv/) for development and packaging
Expand Down
46 changes: 46 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,52 @@ we've got phonetic name matching too:
```


### Cleaning up messy input

If your lookup values come from user input or other unpredictable sources,
`clean_name()` can strip them down to something the lookup is more likely to
match. It lowercases the input, removes punctuation, and drops the filler
words `the`, `commonwealth`, `state`, and `of`:

```python
>>> us.states.clean_name(' The state OF idaho ')
'idaho'
>>> us.states.clean_name('Commonwealth of Virginia')
'virginia'
>>> us.states.lookup(us.states.clean_name('The State of Maryland!'))
<State:Maryland>
```

`clean_name` is a standalone helper and `lookup` does not call it
automatically. Apply it yourself when you want it.


### Custom fallback matching

`lookup` accepts an optional `fallback_func` that is called when none of the
built-in matching strategies find a state. It receives the original lookup
value and should return a `State` or `None`:

```python
>>> def my_fallback(val):
...     return us.states.AK if val == 'the big one' else None
>>> us.states.lookup('the big one', fallback_func=my_fallback)
<State:Alaska>
```

For the common case of matching against the start of a state name, the
included `startswith_fallback` helper does just that (case-insensitively):

```python
>>> us.states.lookup('calif', fallback_func=us.states.startswith_fallback)
<State:California>
```

Fallback results are cached separately per fallback function, so a cached
fallback match won't leak into lookups that pass a different fallback (or
no fallback at all).


### Shapefiles

You want shapefiles too? As long as you want 2010 shapefiles, we've gotcha covered.
Expand Down
84 changes: 78 additions & 6 deletions us/states.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import re
from typing import Any, Dict, Iterable, List, Optional
from typing import Any, Callable, Dict, Iterable, List, Optional
from urllib.parse import urljoin

import jellyfish # type: ignore
Expand Down Expand Up @@ -80,7 +80,12 @@ def shapefile_urls(self) -> Optional[Dict[str, str]]:
return urls


def lookup(val, field: Optional[str] = None, use_cache: bool = True) -> Optional[State]:
def lookup(
val,
field: Optional[str] = None,
use_cache: bool = True,
fallback_func: Optional[Callable[[Any], Optional[State]]] = None,
) -> Optional[State]:
"""Semi-fuzzy state lookup. This method will make a best effort
attempt at finding the state based on the lookup value provided.

Expand All @@ -95,10 +100,21 @@ def lookup(val, field: Optional[str] = None, use_cache: bool = True) -> Optional
the `field` argument. This skips the fuzzy-ish matching and does an
exact, case-sensitive comparison against the specified field.

This method caches non-None results, but can the cache can be bypassed
with the `use_cache=False` argument.
If no match is found and `fallback_func` is provided, it is called as
`fallback_func(val)` -- with the original, unmodified lookup value --
and should return a `State` or `None`. The fallback decides for itself
what to match against. See `startswith_fallback` for a ready-made
example.

This method caches non-None results, but the cache can be bypassed
with the `use_cache=False` argument. A cache hit returns immediately
without scanning the state list. Fallback matches are cached separately
per fallback function, so a cached fallback result is never returned to
a lookup that passes a different fallback or none at all.
"""

original_val = val

matched_state = None

if field is None:
Expand All @@ -111,20 +127,76 @@ def lookup(val, field: Optional[str] = None, use_cache: bool = True) -> Optional
val = jellyfish.metaphone(val)
field = "name_metaphone"

# see if result is in cache
cache_key = f"{field}:{val}"

# a normal-path cache hit short-circuits the rest of the method
if use_cache and cache_key in _lookup_cache:
matched_state = _lookup_cache[cache_key]
return _lookup_cache[cache_key]

# fallback results are cached under a separate, fallback-specific key so
# they never leak into non-fallback lookups -- or lookups using a
# different fallback -- of the same value
fallback_key = None
if fallback_func is not None:
fallback_key = f"{cache_key}:fallback:{fallback_func!r}"
if use_cache and fallback_key in _lookup_cache:
return _lookup_cache[fallback_key]

for state in STATES_AND_TERRITORIES:
if val == getattr(state, field):
matched_state = state
if use_cache:
_lookup_cache[cache_key] = state

if matched_state is None and fallback_func is not None:
matched_state = fallback_func(original_val)
if matched_state is not None and use_cache:
_lookup_cache[fallback_key] = matched_state

return matched_state


# Filler words that `clean_name` strips from its input. Immutable so the
# shared module-level constant cannot be altered by accident.
_CLEAN_NAME_STOPWORDS = frozenset({"the", "commonwealth", "state", "of"})

# Anything that is not a word character, plus the underscore (which `\w`
# would otherwise keep). Compiled once instead of on every call.
_CLEAN_NAME_PUNCTUATION = re.compile(r"[\W_]")


def clean_name(text: str) -> str:
    """Strip an incoming string down to a bare state name.

    Lowercases the input, replaces punctuation and underscores with
    spaces, tokenizes on whitespace, drops the filler words "the",
    "commonwealth", "state", and "of", and recombines the remaining
    tokens into a single space-separated string.

    "of" is kept when it directly joins two non-filler words, because in
    that position it is part of the name rather than filler:

    >>> clean_name(" The state OF idaho ")
    'idaho'
    >>> clean_name("The District of Columbia")
    'district of columbia'

    This is a standalone helper; `lookup` does not call it automatically.

    :param text: the raw string to clean
    :return: the cleaned, lowercased name; empty if nothing but filler
        and punctuation was supplied
    """

    # replace punctuation and underscores with spaces so adjacent words
    # don't fuse together
    words = _CLEAN_NAME_PUNCTUATION.sub(" ", text.lower()).split()
    last_index = len(words) - 1

    def is_name_word(index: int) -> bool:
        # True when `index` is in range and holds a non-filler token
        return 0 <= index <= last_index and words[index] not in _CLEAN_NAME_STOPWORDS

    kept = []
    for index, word in enumerate(words):
        if word == "of":
            # "state of idaho" -> filler; "district of columbia" -> name
            if is_name_word(index - 1) and is_name_word(index + 1):
                kept.append(word)
        elif word not in _CLEAN_NAME_STOPWORDS:
            kept.append(word)
    return " ".join(kept)


def startswith_fallback(val: Any) -> Optional[State]:
    """A `fallback_func` for `lookup` that matches `val` against the start
    of each state's or territory's name, case-insensitively.

    `lookup` hands its fallback the caller's original, unmodified value,
    which is not guaranteed to be a string (for example an integer passed
    together with an explicit `field`). Anything that is not a non-empty
    string is treated as "no match" rather than raising.

    :param val: the lookup value; only non-empty strings can match
    :return: the first state in `STATES_AND_TERRITORIES` whose name starts
        with `val`, or `None` if there is no match
    """

    # non-strings have no `.lower()` (or, for bytes, cannot be compared with
    # a str prefix), so bail out before touching them
    if not isinstance(val, str) or not val:
        return None

    needle = val.lower()
    for state in STATES_AND_TERRITORIES:
        if state.name.lower().startswith(needle):
            return state
    return None


def mapping(from_field: str, to_field: str, states: Optional[Iterable[State]] = None) -> Dict[Any, Any]:
if states is None:
states = STATES_AND_TERRITORIES
Expand Down
92 changes: 92 additions & 0 deletions us/tests/test_us.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,98 @@ def test_obsolete_lookup():
assert us.states.lookup(state.name) is None


# clean_name


def test_clean_name():
    """clean_name lowercases, strips punctuation and drops filler words."""
    # (raw input, expected cleaned output)
    expectations = [
        (" The state OF idaho ", "idaho"),
        ("Idaho!", "idaho"),
        ("idaho", "idaho"),
        ("", ""),
        ("the state of", ""),
        ("New York", "new york"),
        ("new_york", "new york"),
    ]
    for raw, cleaned in expectations:
        assert us.states.clean_name(raw) == cleaned


# fallback_func / startswith_fallback


def test_startswith_fallback():
    """startswith_fallback matches name prefixes regardless of case."""
    fallback = us.states.startswith_fallback
    expected = us.states.lookup("CA")

    # prefixes of "California" match in any letter case
    for prefix in ("calif", "CALIF"):
        assert fallback(prefix) == expected

    # unknown prefixes and the empty string never match
    for miss in ("zzz", ""):
        assert fallback(miss) is None


def test_lookup_fallback_func():
    """lookup consults fallback_func only after every built-in strategy misses."""
    lookup = us.states.lookup
    california = lookup("CA")
    idaho = lookup("ID")

    def boom(val):
        raise AssertionError("fallback_func should not be called on a match")

    # a value the normal strategies miss is resolved by the fallback
    by_prefix = lookup("calif", use_cache=False, fallback_func=us.states.startswith_fallback)
    assert by_prefix == california

    # the same value stays unmatched when no fallback is supplied
    assert lookup("calif", use_cache=False) is None
    assert lookup("calif", use_cache=False, fallback_func=None) is None

    # a successful normal scan never reaches the fallback
    assert lookup("idaho", use_cache=False, fallback_func=boom) == idaho

    # and neither does a cache hit: prime the cache, then look up again
    lookup("idaho")
    assert lookup("idaho", fallback_func=boom) == idaho


def test_lookup_fallback_caching():
    """Fallback matches are cached per fallback function and never leak."""
    lookup = us.states.lookup
    california = lookup("CA")

    seen = []

    def counting_fallback(val):
        seen.append(val)
        return us.states.startswith_fallback(val)

    # two identical lookups: the second is answered from the cache, so the
    # fallback itself only runs once
    for _ in range(2):
        assert lookup("califo", fallback_func=counting_fallback) == california
    assert seen == ["califo"]

    # a lookup without a fallback must not see the cached fallback match
    assert lookup("califo") is None

    # and a different fallback gets its own cache slot, so it is invoked
    other_seen = []

    def other_fallback(val):
        other_seen.append(val)
        return None

    assert lookup("califo", fallback_func=other_fallback) is None
    assert other_seen == ["califo"]

    # with use_cache=False the cache is neither read nor written, so the
    # fallback runs on every single call
    seen.clear()
    for _ in range(2):
        assert lookup("califo", use_cache=False, fallback_func=counting_fallback) == california
    assert seen == ["califo", "califo"]


def test_lookup_cache_hit_short_circuit():
    """A cache hit is returned as-is, without rescanning the state list."""
    # Plant a deliberately wrong entry: getting it back proves lookup
    # trusted the cache instead of scanning for the real Maryland.
    poisoned_key = "abbr:MD"
    us.states._lookup_cache[poisoned_key] = us.states.CA
    try:
        assert us.states.lookup("MD") == us.states.CA
    finally:
        # always remove the bogus entry so later lookups are not affected
        us.states._lookup_cache.pop(poisoned_key, None)


# test metaphone


Expand Down