Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,25 @@ we've got phonetic name matching too:
<State:Mississippi>
```

When the input is wrapped in extra words or punctuation, `clean_name` strips
it down to a bare state name before lookup:

```python
>>> us.states.clean_name(' The state OF idaho ')
'idaho'
>>> us.states.lookup(us.states.clean_name('Commonwealth of Virginia'))
<State:Virginia>
```

If the built-in matching comes up empty, pass a `fallback_func` to supply
your own match logic. The `startswith_fallback` helper matches on a state
name prefix:

```python
>>> us.states.lookup('verm', fallback_func=us.states.startswith_fallback)
<State:Vermont>
```


### Shapefiles

Expand Down
64 changes: 62 additions & 2 deletions us/states.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import re
from typing import Any, Dict, Iterable, List, Optional
from typing import Any, Callable, Dict, Iterable, List, Optional
from urllib.parse import urljoin

import jellyfish # type: ignore
Expand All @@ -13,6 +13,9 @@

_lookup_cache: Dict[str, "State"] = {}

# Words stripped by clean_name when normalizing free-form text to a state name.
_NAME_STOP_WORDS = frozenset({"the", "state", "commonwealth", "of"})


class County:
name: str
Expand Down Expand Up @@ -80,7 +83,50 @@ def shapefile_urls(self) -> Optional[Dict[str, str]]:
return urls


def lookup(val, field: Optional[str] = None, use_cache: bool = True) -> Optional[State]:
def clean_name(val: str) -> str:
    """Normalize free-form text down to a bare, lowercase state name.

    The text is lowercased, every punctuation character (underscores
    included) is replaced by a space, the filler words listed in
    `_NAME_STOP_WORDS` are dropped, and the remaining words are joined
    with single spaces. For example, " The state OF idaho " becomes
    "idaho", "Commonwealth of Virginia" becomes "virginia", and both
    "North-Dakota" and "north_dakota" become "north dakota".

    Returns an empty string when nothing but punctuation, whitespace,
    and filler words was supplied.
    """

    # Swap punctuation for a space (not "") so that marks used as word
    # separators still split the words. `\w` counts "_" as a word
    # character, so the underscore has to be matched explicitly.
    spaced = re.sub(r"[^\w\s]|_", " ", val.lower())

    # str.split() with no argument trims both ends and collapses runs of
    # whitespace, so no separate whitespace cleanup is needed
    words = [word for word in spaced.split() if word not in _NAME_STOP_WORDS]

    return " ".join(words)


def startswith_fallback(val: str) -> Optional[State]:
    """Prefix-matching `fallback_func` for use with `lookup`.

    Walks `STATES_AND_TERRITORIES` in order and returns the first entry
    whose name begins with `val`, ignoring letter case. Short prefixes can
    be ambiguous (e.g. "m"); whichever entry comes first is returned.
    Returns None when no name matches or when `val` is empty.
    """

    # every name starts with the empty string, so reject it up front
    if not val:
        return None

    prefix = val.lower()

    candidates = (
        state
        for state in STATES_AND_TERRITORIES
        if state.name.lower().startswith(prefix)
    )

    # first hit wins; None when the generator is exhausted without one
    return next(candidates, None)


def lookup(
val,
field: Optional[str] = None,
use_cache: bool = True,
fallback_func: Optional[Callable[[Any], Optional[State]]] = None,
) -> Optional[State]:
"""Semi-fuzzy state lookup. This method will make a best effort
attempt at finding the state based on the lookup value provided.

Expand All @@ -97,10 +143,20 @@ def lookup(val, field: Optional[str] = None, use_cache: bool = True) -> Optional

This method caches non-None results, but the cache can be bypassed
with the `use_cache=False` argument.

If no match is found and a `fallback_func` is provided, it is called
with the original, untransformed lookup value as its single argument
and its return value is used as the result. This lets callers supply
custom match logic (see `startswith_fallback`). Fallback results are
not cached.
"""

matched_state = None

# preserve the caller's original value for the fallback, since `val`
# may be reassigned below (e.g. to its metaphone)
original_val = val

if field is None:
if FIPS_RE.match(val):
field = "fips"
Expand All @@ -122,6 +178,10 @@ def lookup(val, field: Optional[str] = None, use_cache: bool = True) -> Optional
if use_cache:
_lookup_cache[cache_key] = state

# no match yet — give the caller-provided fallback a final attempt
if matched_state is None and fallback_func is not None:
matched_state = fallback_func(original_val)

return matched_state


Expand Down
78 changes: 78 additions & 0 deletions us/tests/test_us.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,84 @@ def test_obsolete_lookup():
assert us.states.lookup(state.name) is None


# clean_name


def test_clean_name_example():
    """Padding, mixed case and filler words are all stripped away."""
    cleaned = us.states.clean_name(" The state OF idaho ")
    assert cleaned == "idaho"


def test_clean_name_removes_punctuation():
    """Punctuation is dropped; a mark between two words leaves one space."""
    expected_by_input = {
        "Idaho!": "idaho",
        "North-Dakota": "north dakota",
    }
    for raw, expected in expected_by_input.items():
        assert us.states.clean_name(raw) == expected


def test_clean_name_removes_stop_words():
    """Filler words are removed while multi-word state names stay intact."""
    cases = (
        ("Commonwealth of Virginia", "virginia"),
        ("The State of New York", "new york"),
    )
    for raw, expected in cases:
        assert us.states.clean_name(raw) == expected


def test_clean_name_collapses_whitespace():
    """Surrounding whitespace is trimmed from a two-word name."""
    cleaned = us.states.clean_name(" new york ")
    assert cleaned == "new york"


def test_clean_name_passthrough():
    """An already-bare state name is only lowercased."""
    cleaned = us.states.clean_name("Idaho")
    assert cleaned == "idaho"


# fallback_func


def test_lookup_fallback_used_when_no_match():
    """A truncated name is resolved through the supplied fallback."""
    found = us.states.lookup("idah", fallback_func=us.states.startswith_fallback)
    assert found == us.states.ID


def test_lookup_fallback_not_used_when_matched():
    """The fallback must never be invoked when the normal lookup succeeds."""
    seen = []

    def recording_fallback(value):
        # record every invocation so the test can prove there were none
        seen.append(value)
        return None

    result = us.states.lookup("Maryland", fallback_func=recording_fallback)

    assert result == us.states.MD
    assert not seen


def test_lookup_fallback_default_none():
    """With no fallback supplied, an unmatched value yields None."""
    result = us.states.lookup("notastate")
    assert result is None


def test_lookup_fallback_receives_original_value():
    """The fallback is handed the caller's value exactly as it was passed."""
    captured = []

    def capture(value):
        captured.append(value)
        return None

    us.states.lookup("notastate", fallback_func=capture)

    # exactly one call, carrying the untouched input string
    assert captured == ["notastate"]


# startswith_fallback


def test_startswith_fallback_matches_prefix():
    """A name prefix matches regardless of letter case."""
    for prefix in ("ida", "IDA"):
        assert us.states.startswith_fallback(prefix) == us.states.ID


def test_startswith_fallback_no_match():
    """A prefix that begins no state name yields None."""
    result = us.states.startswith_fallback("zzz")
    assert result is None


def test_startswith_fallback_empty_string():
    """The empty string matches nothing instead of matching every state."""
    result = us.states.startswith_fallback("")
    assert result is None


def test_startswith_fallback_via_lookup():
    """`lookup` combined with `startswith_fallback` resolves a mixed-case prefix."""
    found = us.states.lookup("Verm", fallback_func=us.states.startswith_fallback)
    assert found == us.states.VT


# test metaphone


Expand Down