image_management_api/simple_search_test.py
2025-05-25 16:52:38 +02:00

121 lines
4.6 KiB
Python

#!/usr/bin/env python3
"""
Simple test script: embed a text query and search Qdrant, first without
filters at several score thresholds and then with a team_id filter.
"""
import os
import sys
import asyncio
import logging
from dotenv import load_dotenv
# Load environment variables via python-dotenv's load_dotenv(). This runs at
# import time, i.e. before simple_search_test() imports the project services.
# NOTE(review): presumably the services read their configuration from the
# environment - confirm against src.services.
load_dotenv()
# Set up logging: configure the root logger at INFO level, then create the
# module-level logger used by the log calls below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
async def simple_search_test(
    search_query="rectangle",
    team_id="68330a29472a0704d2f77063",
):
    """Embed a text query and run diagnostic similarity searches with it.

    The steps, all reported through the module logger, are:

    1. Generate a text embedding for ``search_query``.
    2. Search without filters at several score thresholds and log the
       first three hits of each search.
    3. Search with ``score_threshold=0.0`` and ``limit=50``.
    4. Repeat that search with a ``team_id`` filter, then log the team ID
       stored in each unfiltered result plus simple score statistics.

    Args:
        search_query: Text to embed and use as the query vector.
        team_id: Value passed as the ``team_id`` filter condition in step 4.
            The default is an ID taken from server logs.

    Returns:
        True if every step ran without raising. False if no embedding was
        produced or if any exception was raised (the exception is logged
        with its traceback, not propagated).
    """
    try:
        # Import services inside the try block so that import failures are
        # logged and reported through the same failure path as everything else.
        from src.services.vector_db import VectorDatabaseService
        from src.services.embedding_service import EmbeddingService

        # Initialize services
        logger.info("Initializing services...")
        vector_db = VectorDatabaseService()
        embedding_service = EmbeddingService()

        # Test 1: Generate text embedding
        logger.info("=== Generating Text Embedding ===")
        text_embedding = await embedding_service.generate_text_embedding(search_query)

        # Explicit None/empty check: plain truthiness raises for array-like
        # embeddings with more than one element (e.g. numpy arrays).
        if text_embedding is None or len(text_embedding) == 0:
            logger.error("✗ Failed to generate text embedding")
            return False
        logger.info(f"✓ Generated embedding for '{search_query}' - length: {len(text_embedding)}")

        # Test 2: Search without any filters
        logger.info("=== Searching Qdrant (No Filters) ===")

        # Try different thresholds to see what we get
        for threshold in (0.1, 0.3, 0.5, 0.65, 0.8):
            logger.info(f"\n--- Threshold: {threshold} ---")
            search_results = vector_db.search_similar_images(
                query_vector=text_embedding,
                limit=10,
                score_threshold=threshold,
                # No filter_conditions = search everything
            )
            logger.info(f"Found {len(search_results)} results")

            # Show the first 3 results
            for rank, result in enumerate(search_results[:3], start=1):
                # Tolerate results without metadata, consistent with the
                # team-ID listing further down.
                metadata = result.get('metadata') or {}
                filename = metadata.get('filename', 'N/A')
                logger.info(f" {rank}. Score: {result['score']:.4f} | ID: {result['image_id']} | File: {filename}")

        # Test 3: Zero threshold to see as much data as the limit allows
        logger.info("\n=== All Data (Threshold 0.0) ===")
        all_results = vector_db.search_similar_images(
            query_vector=text_embedding,
            limit=50,
            score_threshold=0.0,  # Do not exclude anything by score
        )
        # This is the number of hits returned, capped by limit=50; it is not
        # a count of the vectors stored in the collection.
        logger.info(f"Results returned at threshold 0.0 (capped at limit=50): {len(all_results)}")

        # Test 4: With team filtering (like the API does)
        logger.info("\n=== Testing Team Filtering ===")
        filtered_results = vector_db.search_similar_images(
            query_vector=text_embedding,
            limit=50,
            score_threshold=0.0,
            filter_conditions={"team_id": team_id},
        )
        logger.info(f"Results with team filter ({team_id}): {len(filtered_results)}")

        # Show metadata for all unfiltered results to see which team_ids
        # are actually stored
        logger.info("\n=== Checking Team IDs in Vector DB ===")
        for rank, result in enumerate(all_results, start=1):
            metadata = result.get('metadata') or {}
            stored_team_id = metadata.get('team_id', 'N/A')
            logger.info(f" {rank}. Image ID: {result['image_id']} | Team ID: {stored_team_id}")

        # Show some stats (skipped when nothing was returned, which also
        # avoids min()/max() on an empty list and a division by zero)
        if all_results:
            scores = [result['score'] for result in all_results]
            logger.info(f"Score range: {min(scores):.4f} to {max(scores):.4f}")
            logger.info(f"Average score: {sum(scores)/len(scores):.4f}")

            # Show first 5 and last 5 entries of the result list
            logger.info("\nTop 5 results:")
            for rank, result in enumerate(all_results[:5], start=1):
                logger.info(f" {rank}. Score: {result['score']:.4f} | ID: {result['image_id']}")

            if len(all_results) > 5:
                logger.info("\nBottom 5 results:")
                # Number the tail entries by their 1-based position in the
                # full result list.
                first_tail_rank = len(all_results) - 4
                for rank, result in enumerate(all_results[-5:], start=first_tail_rank):
                    logger.info(f" {rank}. Score: {result['score']:.4f} | ID: {result['image_id']}")

        logger.info("\n✓ Simple search test completed!")
        return True
    except Exception as e:
        # Top-level boundary of the script: log the error together with its
        # traceback and signal failure through the return value.
        logger.exception(f"✗ Test failed: {e}")
        return False
if __name__ == "__main__":
    # Drive the coroutine to completion and translate its boolean outcome
    # into a conventional process exit status (0 = success, 1 = failure).
    exit_code = 0 if asyncio.run(simple_search_test()) else 1
    sys.exit(exit_code)