Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,35 @@ my_feature_builder = FeatureBuilder(
my_feature_builder.featurize()
```

If you already generate embeddings elsewhere in your pipeline, you can supply your own encoder instead of using the default sentence-transformers model:

```python
import numpy as np
from openai import OpenAI

client = OpenAI()

def openai_encoder(texts):
response = client.embeddings.create(
model="text-embedding-3-small",
input=texts,
)
return np.array([item.embedding for item in response.data])

my_feature_builder = FeatureBuilder(
input_df = my_pandas_dataframe,
conversation_id_col = "conversation_id",
speaker_id_col = "speaker_id",
message_col = "message",
vector_directory = "./vector_data/",
embedding_fn = openai_encoder,
embedding_backend_id = "openai-text-embedding-3-small",
embedding_dim = 1536,
)
```

When a custom `embedding_fn` is provided, the package keeps the vector cache separate for that backend and does not initialize the default sentence-transformers model unless it is actually needed.

### Data Format
We accept input data in the format of a Pandas DataFrame. Your data needs to have three (3) required input columns and one optional column.

Expand Down
37 changes: 34 additions & 3 deletions src/team_comm_tools/feature_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import time
import itertools
import warnings
from collections.abc import Callable

# Imports from feature files and classes
from team_comm_tools.utils.download_resources import download
Expand Down Expand Up @@ -90,6 +91,15 @@ class FeatureBuilder:
:type ner_cutoff: int
:param regenerate_vectors: If true, regenerates vector data even if it already exists. Defaults to False.
:type regenerate_vectors: bool, optional
:param embedding_fn: Optional callable that maps a list of messages to a 2D embedding array.
Defaults to None, which preserves the built-in sentence-transformers backend.
:type embedding_fn: Callable[[list[str]], np.ndarray] | None, optional
:param embedding_backend_id: Optional identifier used to keep custom vector caches distinct across
embedding backends. Ignored when `embedding_fn` is None.
:type embedding_backend_id: str | None, optional
:param embedding_dim: Optional embedding dimension for validating custom encoder output and stabilizing
custom vector cache keys.
:type embedding_dim: int | None, optional
:param compute_vectors_from_preprocessed: If true, computes vectors using preprocessed text (with
capitalization and punctuation removed). Defaults to False.
:type compute_vectors_from_preprocessed: bool, optional
Expand Down Expand Up @@ -137,6 +147,9 @@ def __init__(
ner_training_df: pd.DataFrame = None,
ner_cutoff: int = 0.9,
regenerate_vectors: bool = False,
embedding_fn: Callable[[list[str]], np.ndarray] | None = None,
embedding_backend_id: str | None = None,
embedding_dim: int | None = None,
compute_vectors_from_preprocessed: bool = False,
custom_liwc_dictionary_path: str = '',
convo_aggregation = True,
Expand Down Expand Up @@ -174,6 +187,9 @@ def __init__(
self.within_task = within_task
self.ner_cutoff = ner_cutoff
self.regenerate_vectors = regenerate_vectors
self.embedding_fn = embedding_fn
self.embedding_backend_id = embedding_backend_id
self.embedding_dim = embedding_dim
self.convo_aggregation = convo_aggregation
self.convo_methods = convo_methods
self.convo_columns = convo_columns
Expand Down Expand Up @@ -389,10 +405,25 @@ def __init__(
self.output_file_path_user_level = re.sub(r'/user/', r'/output/user/', self.output_file_path_user_level)

# Logic for processing vector cache
self.vect_path = vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name
self.vect_path = build_vector_cache_path(
vector_directory + "sentence/" + ("turns" if self.turns else "chats") + "/" + base_file_name,
embedding_fn=self.embedding_fn,
embedding_backend_id=self.embedding_backend_id,
embedding_dim=self.embedding_dim,
)
self.bert_path = vector_directory + "sentiment/" + ("turns" if self.turns else "chats") + "/" + base_file_name

check_embeddings(self.chat_data, self.vect_path, self.bert_path, need_sentence, need_sentiment, self.regenerate_vectors, message_col = self.vector_colname)
check_embeddings(
self.chat_data,
self.vect_path,
self.bert_path,
need_sentence,
need_sentiment,
self.regenerate_vectors,
message_col=self.vector_colname,
embedding_fn=self.embedding_fn,
embedding_dim=self.embedding_dim,
)

if(need_sentence):
self.vect_data = pd.read_csv(self.vect_path, encoding='mac_roman')
Expand Down Expand Up @@ -781,4 +812,4 @@ def verify_timestamp_format(self, timestamp_col) -> None:
raise ValueError(
f"Column '{timestamp_col}' contains values that are neither parseable as datetime "
f"nor convertible to numeric format."
)
)
13 changes: 10 additions & 3 deletions src/team_comm_tools/features/within_person_discursive_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,14 @@
import warnings
warnings.filterwarnings('ignore') # We get empty slice warnings for short conversations

def get_nan_vector():
def get_nan_vector(chat_data=None):
if chat_data is not None:
for value in chat_data["message_embedding"]:
if isinstance(value, np.ndarray):
return np.zeros(value.shape, dtype=float)
if isinstance(value, (list, tuple)):
return np.zeros(len(value), dtype=float)

current_dir = os.path.dirname(__file__)
nan_vector_file_path = os.path.join(current_dir, './assets/nan_vector.txt')
nan_vector_file_path = os.path.abspath(nan_vector_file_path)
Expand Down Expand Up @@ -35,13 +42,13 @@ def get_nan_vector():
def get_within_person_disc_range(chat_data, num_chunks, conversation_id_col, speaker_id_col):

# Get nan vector
nan_vector = get_nan_vector()
nan_vector = get_nan_vector(chat_data)

#calculate mean vector per speaker per chunk
mean_vec_speaker_chunks = pd.DataFrame(chat_data.groupby([conversation_id_col, speaker_id_col, 'chunk_num']).message_embedding.apply(np.mean)).unstack('chunk_num').rename(columns={'message_embedding': 'mean_chunk_vec'})

#collapse multi-index
mean_vec_speaker_chunks.columns = ["_c".join(col).strip() for col in mean_vec_speaker_chunks.columns.values]
mean_vec_speaker_chunks.columns = ["_c".join(str(part) for part in col).strip() for col in mean_vec_speaker_chunks.columns.values]

actual_num_chunks = len(mean_vec_speaker_chunks[2:].columns) # omit the first two, which is conversation_num and speaker_nickname

Expand Down
137 changes: 124 additions & 13 deletions src/team_comm_tools/utils/check_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import pickle
import warnings
import hashlib
from tqdm import tqdm
from pathlib import Path

Expand All @@ -17,18 +18,93 @@

logging.set_verbosity(40) # only log errors

model_vect = SentenceTransformer('all-MiniLM-L6-v2')
DEFAULT_VECTOR_MODEL_NAME = 'all-MiniLM-L6-v2'
model_vect = None
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model_bert = AutoModelForSequenceClassification.from_pretrained(MODEL)
tokenizer = None
model_bert = None
os.environ["TOKENIZERS_PARALLELISM"] = "false"
EMOJIS_TO_PRESERVE = {
"(:", "(;", "):", "/:", ":(", ":)", ":/", ";)"
}


def get_vector_model():
global model_vect
if model_vect is None:
model_vect = SentenceTransformer(DEFAULT_VECTOR_MODEL_NAME)
return model_vect


def get_sentiment_model():
global tokenizer, model_bert
if tokenizer is None:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
if model_bert is None:
model_bert = AutoModelForSequenceClassification.from_pretrained(MODEL)
return tokenizer, model_bert


def default_vector_encoder(texts):
return get_vector_model().encode(texts)


def get_embedding_cache_suffix(embedding_fn=None, embedding_backend_id=None, embedding_dim=None):
if embedding_fn is None:
return ""

raw_backend_id = embedding_backend_id
if not raw_backend_id:
callable_name = getattr(embedding_fn, "__qualname__", type(embedding_fn).__qualname__)
raw_backend_id = f"{embedding_fn.__module__}.{callable_name}"
if embedding_dim is not None:
raw_backend_id = f"{raw_backend_id}-dim{embedding_dim}"

safe_backend_id = re.sub(r"[^A-Za-z0-9]+", "-", raw_backend_id).strip("-").lower()[:48] or "custom"
digest = hashlib.sha1(raw_backend_id.encode("utf-8")).hexdigest()[:12]
return f"__{safe_backend_id}-{digest}"


def build_vector_cache_path(vect_path: str, embedding_fn=None, embedding_backend_id=None, embedding_dim=None) -> str:
suffix = get_embedding_cache_suffix(
embedding_fn=embedding_fn,
embedding_backend_id=embedding_backend_id,
embedding_dim=embedding_dim,
)
if not suffix:
return vect_path

path = Path(vect_path)
return str(path.with_name(f"{path.stem}{suffix}{path.suffix}"))


def validate_embeddings(embeddings, expected_rows, embedding_dim=None):
embeddings = np.asarray(embeddings, dtype=float)

if embeddings.ndim == 1:
if expected_rows != 1:
raise ValueError("Custom embedding_fn must return one embedding per input string.")
embeddings = embeddings.reshape(1, -1)

if embeddings.ndim != 2:
raise ValueError("Custom embedding_fn must return a 2D array-like object.")

if embeddings.shape[0] != expected_rows:
raise ValueError(
"Custom embedding_fn must return the same number of embeddings as the number of input strings."
)

if embedding_dim is not None and embeddings.shape[1] != embedding_dim:
raise ValueError(
f"Custom embedding_fn returned vectors with dimension {embeddings.shape[1]}, expected {embedding_dim}."
)

return embeddings

# Check if embeddings exist
def check_embeddings(chat_data: pd.DataFrame, vect_path: str, bert_path: str, need_sentence: bool,
need_sentiment: bool, regenerate_vectors: bool, message_col: str = "message"):
need_sentiment: bool, regenerate_vectors: bool, message_col: str = "message",
embedding_fn=None, embedding_dim=None):
"""
Check if embeddings and required lexicons exist, and generate them if they don't.

Expand All @@ -54,7 +130,13 @@ def check_embeddings(chat_data: pd.DataFrame, vect_path: str, bert_path: str, ne
:rtype: None
"""
if (regenerate_vectors or (not os.path.isfile(vect_path))) and need_sentence:
generate_vect(chat_data, vect_path, message_col)
generate_vect(
chat_data,
vect_path,
message_col,
embedding_fn=embedding_fn,
embedding_dim=embedding_dim,
)
if (regenerate_vectors or (not os.path.isfile(bert_path))) and need_sentiment:
generate_bert(chat_data, bert_path, message_col)

Expand All @@ -63,10 +145,22 @@ def check_embeddings(chat_data: pd.DataFrame, vect_path: str, bert_path: str, ne
# check whether the given vector and bert data matches length of chat data
if len(vector_df) != len(chat_data):
print("ERROR: The length of the vector data does not match the length of the chat data. Regenerating...")
generate_vect(chat_data, vect_path, message_col)
generate_vect(
chat_data,
vect_path,
message_col,
embedding_fn=embedding_fn,
embedding_dim=embedding_dim,
)
except FileNotFoundError: # It's OK if we don't have the path, if the sentence vectors are not necessary
if need_sentence:
generate_vect(chat_data, vect_path, message_col)
generate_vect(
chat_data,
vect_path,
message_col,
embedding_fn=embedding_fn,
embedding_dim=embedding_dim,
)

try:
bert_df = pd.read_csv(bert_path)
Expand Down Expand Up @@ -337,18 +431,24 @@ def str_to_vec(str_vec):
vector_list = [float(e) for e in str_vec[1:-1].split(',')]
return np.array(vector_list)

def get_nan_vector():
def get_nan_vector(embedding_fn=None, embedding_dim=None):
"""
Get a default value for an empty string (the "NaN vector") and returns it as a 1D np array.
"""
if embedding_dim is not None:
return np.zeros(embedding_dim, dtype=float)

if embedding_fn is not None:
return validate_embeddings(embedding_fn([""]), expected_rows=1, embedding_dim=embedding_dim)[0]

current_dir = os.path.dirname(__file__)
nan_vector_file_path = os.path.join(current_dir, '../features/assets/nan_vector.txt')
nan_vector_file_path = os.path.abspath(nan_vector_file_path)

with open(nan_vector_file_path, "r") as f:
return str_to_vec(f.read())

def generate_vect(chat_data, output_path, message_col, batch_size = 64):
def generate_vect(chat_data, output_path, message_col, batch_size = 64, embedding_fn=None, embedding_dim=None):
"""
Generates sentence vectors for the given chat data and saves them to a CSV file.

Expand All @@ -364,12 +464,21 @@ def generate_vect(chat_data, output_path, message_col, batch_size = 64):
:return: None
:rtype: None
"""
print(f"Generating SBERT sentence vectors...")
print("Generating sentence vectors...")

nan_vector = get_nan_vector()
encoder = embedding_fn or default_vector_encoder
nan_vector = get_nan_vector(embedding_fn=embedding_fn, embedding_dim=embedding_dim)
empty_to_nan = [text if text and text.strip() else None for text in chat_data[message_col].tolist()]
non_empty_texts = [text for text in empty_to_nan if text is not None]
all_embeddings = [emb for i in tqdm(range(0, len(non_empty_texts), batch_size)) for emb in model_vect.encode(non_empty_texts[i:i + batch_size])]
all_embeddings = []
for i in tqdm(range(0, len(non_empty_texts), batch_size)):
batch = non_empty_texts[i:i + batch_size]
batch_embeddings = validate_embeddings(
encoder(batch),
expected_rows=len(batch),
embedding_dim=embedding_dim,
)
all_embeddings.extend(batch_embeddings)
embeddings = np.tile(nan_vector, (len(empty_to_nan), 1)) # default embeddings to the NAN vector
non_empty_index = 0
for idx, text in enumerate(empty_to_nan):
Expand Down Expand Up @@ -423,6 +532,8 @@ def get_sentiment(texts):
:rtype: pd.DataFrame
"""

tokenizer, model_bert = get_sentiment_model()

# Handle and tokenize non-null and non-empty texts
texts_series = pd.Series(texts)
non_null_non_empty_texts = texts_series[texts_series.apply(lambda x: pd.notnull(x) and x.strip() != '')].tolist()
Expand All @@ -449,4 +560,4 @@ def get_sentiment(texts):
sent_df = pd.DataFrame(np.nan, index=texts_series.index, columns=['positive_bert', 'negative_bert', 'neutral_bert'])
sent_df.loc[texts_series.apply(lambda x: pd.notnull(x) and x.strip() != ''), ['positive_bert', 'negative_bert', 'neutral_bert']] = non_null_sent_df.values

return sent_df
return sent_df
Loading