Skip to content
This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Validate persona description is sufficiently different #1225

Merged
merged 1 commit into from
Mar 5, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/codegate/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,14 @@ class Config:
force_certs: bool = False

max_fim_hash_lifetime: int = 60 * 5 # Time in seconds. Default is 5 minutes.

# Min value is 0 (max similarity), 1 is orthogonal, max value is 2 (opposite direction)
# The value 0.75 was found through experimentation. See /tests/muxing/test_semantic_router.py
# It's the threshold value to determine if a query matches a persona.
persona_threshold = 0.75
# The value 0.3 was found through experimentation. See /tests/muxing/test_semantic_router.py
# It's the threshold value to determine if a persona description is similar to existing personas
persona_diff_desc_threshold = 0.3

# Provider URLs with defaults
provider_urls: Dict[str, str] = field(default_factory=lambda: DEFAULT_PROVIDER_URLS.copy())
Expand Down
20 changes: 20 additions & 0 deletions src/codegate/db/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -1004,6 +1004,26 @@ async def get_persona_by_name(self, persona_name: str) -> Optional[Persona]:
)
return personas[0] if personas else None

async def get_distance_to_existing_personas(
    self, query_embedding: np.ndarray
) -> List[PersonaDistance]:
    """
    Get the cosine distance from a query embedding to every stored persona.

    The query selects from the personas table without any filter and lets
    the database compute the distance via vec_distance_cosine between each
    row's description_embedding and the bound query embedding.

    Args:
        query_embedding: Embedding bound to the :query_embedding parameter.

    Returns:
        The query results parsed as PersonaDistance objects.
    """
    # NOTE: the SQL text lives in a triple-quoted literal, so its inner
    # whitespace is part of the string and is left untouched.
    distance_query = """
SELECT
id,
name,
description,
vec_distance_cosine(description_embedding, :query_embedding) as distance
FROM personas
"""
    bind_params = {"query_embedding": query_embedding}
    return await self._exec_vec_db_query_to_pydantic(
        distance_query, bind_params, PersonaDistance
    )

async def get_distance_to_persona(
self, persona_id: str, query_embedding: np.ndarray
) -> PersonaDistance:
Expand Down
2 changes: 2 additions & 0 deletions src/codegate/db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,8 @@ class MuxRule(BaseModel):

def nd_array_custom_before_validator(x):
    """
    Custom "before" validation step for array values.

    Raw ``bytes`` are reinterpreted as a float32 NumPy array; any other
    value is handed back unchanged.
    """
    # np.frombuffer views the bytes as float32 items without parsing them.
    return np.frombuffer(x, dtype=np.float32) if isinstance(x, bytes) else x


Expand Down
31 changes: 31 additions & 0 deletions src/codegate/muxing/semantic_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ class PersonaDoesNotExistError(Exception):
pass


class PersonaSimilarDescriptionError(Exception):
    """Raised when a persona description is too similar to existing personas."""


class SemanticRouter:

def __init__(self):
Expand All @@ -36,6 +40,7 @@ def __init__(self):
self._embeddings_model = f"{conf.model_base_path}/{conf.embedding_model}"
self._n_gpu = conf.chat_model_n_gpu_layers
self._persona_threshold = conf.persona_threshold
self._persona_diff_desc_threshold = conf.persona_diff_desc_threshold
self._db_recorder = DbRecorder()
self._db_reader = DbReader()

Expand Down Expand Up @@ -105,12 +110,38 @@ async def _embed_text(self, text: str) -> np.ndarray:
logger.debug("Text embedded in semantic routing", text=cleaned_text[:50])
return np.array(embed_list[0], dtype=np.float32)

async def _is_persona_description_diff(self, emb_persona_desc: np.ndarray) -> bool:
"""
Check if the persona description is different enough from existing personas.
"""
# The distance calculation is done in the database
persona_distances = await self._db_reader.get_distance_to_existing_personas(
emb_persona_desc
)
if not persona_distances:
return True

for persona_distance in persona_distances:
logger.info(
f"Persona description distance to {persona_distance.name}",
distance=persona_distance.distance,
)
# If the distance is less than the threshold, the persona description is too similar
if persona_distance.distance < self._persona_diff_desc_threshold:
return False
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How expensive is it to parallelize this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried 2 approaches to get the distances:

  1. Direct query to sqlite and get the distances
  2. Query to get all the personas, then use numpy matrices operations to get the distance

The result of the experiment was that it didn't matter; both of them were practically on par.

For this specific comparison, which only checks the threshold, parallelizing it with matrix operations probably makes no difference. I don't expect anyone to have 1000 different personas in their DB. If that happens then yes, we would need to optimize. With a sensible number of personas (<10) it really makes no difference.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's take an extreme but not unreasonable example: 100 personas. Would we start seeing issues in this case?

return True

async def add_persona(self, persona_name: str, persona_desc: str) -> None:
"""
Add a new persona to the database. The persona description is embedded
and stored in the database.
"""
emb_persona_desc = await self._embed_text(persona_desc)
if not await self._is_persona_description_diff(emb_persona_desc):
raise PersonaSimilarDescriptionError(
"The persona description is too similar to existing personas."
)

new_persona = db_models.PersonaEmbedding(
id=str(uuid.uuid4()),
name=persona_name,
Expand Down
Loading