diff --git a/README.md b/README.md index 42986a2..0f9f25c 100644 --- a/README.md +++ b/README.md @@ -407,9 +407,9 @@ The API provides the following main endpoints with their authentication and pagi - **Query Parameters:** - `q` (required) - Search query - `limit` (default: 10, min: 1, max: 50) - Number of results - - `threshold` (default: 0.7, min: 0.0, max: 1.0) - Similarity threshold + - `similarity_threshold` (default: 0.7, min: 0.0, max: 1.0) - Similarity threshold - `collection_id` (optional) - Filter by collection - - **Response includes:** `results`, `total`, `limit`, `threshold`, `query` + - **Response includes:** `results`, `total`, `limit`, `similarity_threshold`, `query` - `POST /api/v1/search` - Advanced search with same pagination ### šŸ”‘ **Authentication Model** diff --git a/client/js/api.js b/client/js/api.js index 03b228e..becdb14 100644 --- a/client/js/api.js +++ b/client/js/api.js @@ -180,7 +180,7 @@ class ApiClient { async searchImages(query, similarityThreshold, maxResults = 20) { const searchData = { query, - threshold: similarityThreshold, + similarity_threshold: similarityThreshold, limit: maxResults }; diff --git a/deployment/cloud-function/main.py b/deployment/cloud-function/main.py index c13ba15..8584afa 100644 --- a/deployment/cloud-function/main.py +++ b/deployment/cloud-function/main.py @@ -224,6 +224,29 @@ def generate_image_embeddings(image_data: bytes) -> Optional[np.ndarray]: Numpy array of embeddings or None if failed """ try: + # Basic validation of image data + if not image_data or len(image_data) == 0: + logger.error("Empty image data provided") + return None + + # Check image size (limit to 10MB) + if len(image_data) > 10 * 1024 * 1024: + logger.warning(f"Large image detected: {len(image_data)} bytes") + + # Validate image format using PIL + try: + pil_image = Image.open(io.BytesIO(image_data)) + logger.info(f"Image format: {pil_image.format}, size: {pil_image.size}, mode: {pil_image.mode}") + + # Check for blank/empty images + if pil_image.size[0] == 0 or pil_image.size[1] == 0: + logger.error("Image has zero dimensions") + return None + + except Exception as e: + logger.error(f"Invalid image format: {e}") + return None + # Create Vertex AI image object vertex_image = VertexImage(image_data) @@ -238,15 +261,23 @@ def generate_image_embeddings(image_data: bytes) -> Optional[np.ndarray]: # Get the image embedding vector embedding_vector = embeddings.image_embedding - # Convert to numpy array + # Convert to numpy array - DO NOT normalize Vertex AI embeddings + # This must match the behavior in the main embedding service embeddings_array = np.array(embedding_vector, dtype=np.float32) - # Normalize the feature vector - norm = np.linalg.norm(embeddings_array) - if norm > 0: - embeddings_array = embeddings_array / norm + # Validate embedding quality + if np.any(np.isnan(embeddings_array)) or np.any(np.isinf(embeddings_array)): + logger.error("Generated embeddings contain NaN or infinite values") + return None + + # Check if embedding is mostly zeros (might indicate processing issue) + zero_ratio = np.sum(embeddings_array == 0.0) / len(embeddings_array) + if zero_ratio > 0.9: + logger.warning(f"Embedding is {zero_ratio*100:.1f}% zeros - might indicate processing issue") logger.info(f"Generated embeddings with shape: {embeddings_array.shape}") + logger.info(f"Embedding stats - min: {embeddings_array.min():.6f}, max: {embeddings_array.max():.6f}, norm: {np.linalg.norm(embeddings_array):.6f}") + return embeddings_array except Exception as e: diff --git a/deployment/terraform/terraform.tfstate b/deployment/terraform/terraform.tfstate index a988b87..e97d695 100644 --- a/deployment/terraform/terraform.tfstate +++ b/deployment/terraform/terraform.tfstate @@ -1,7 +1,7 @@ { "version": 4, "terraform_version": "1.10.1", - "serial": 445, + "serial": 451, "lineage": "a183cd95-f987-8698-c6dd-84e933c394a5", "outputs": { "cloud_function_name": { @@ -98,16 +98,16 @@ "attributes": { "exclude_symlink_directories": null, "excludes": null, - "id": "0cfb36e4e396f12e3ad2944c44b083bff2224ad5", - "output_base64sha256": "uMoV4IM2IuGcRtqeI7wbu3OsTmvDx1ohDDxkEE5NY9U=", - "output_base64sha512": "BQB+g3lC0+y5vOx6KHh4AWCeHk3D2nmdgE8JrFaiPlCWV6KsrMdANGyKeZ/aFmvGjbFw7MGQD4s0u/tn+viVAA==", + "id": "fe2be242d7b603458e099720b9d99e319952abb0", + "output_base64sha256": "+3DKsNV8OT4cv8S2rftR1Pe8UUHzJNn2hhWkS8FcN9I=", + "output_base64sha512": "2+zPRbyYxFDTa0LJz6TmZOajp10eJNQla9tK0S8d++4T6vXgt7nKSy81GO+V4ttWYopmipOvRYJe5YZ4WvmecA==", "output_file_mode": null, - "output_md5": "b532cf3ff81d62dd7dec013e486931aa", + "output_md5": "8e8a10dd705e45fe8645782a20d999a3", "output_path": "./function-source.zip", - "output_sha": "0cfb36e4e396f12e3ad2944c44b083bff2224ad5", - "output_sha256": "b8ca15e0833622e19c46da9e23bc1bbb73ac4e6bc3c75a210c3c64104e4d63d5", - "output_sha512": "05007e837942d3ecb9bcec7a28787801609e1e4dc3da799d804f09ac56a23e509657a2acacc740346c8a799fda166bc68db170ecc1900f8b34bbfb67faf89500", - "output_size": 69764346, + "output_sha": "fe2be242d7b603458e099720b9d99e319952abb0", + "output_sha256": "fb70cab0d57c393e1cbfc4b6adfb51d4f7bc5141f324d9f68615a44bc15c37d2", + "output_sha512": "dbeccf45bc98c450d36b42c9cfa4e664e6a3a75d1e24d4256bdb4ad12f1dfbee13eaf5e0b7b9ca4b2f3518ef95e2db56628a668a93af45825ee586785af99e70", + "output_size": 69764811, "source": [], "source_content": null, "source_content_filename": null, @@ -172,7 +172,7 @@ "effective_annotations": { "run.googleapis.com/ingress": "all", "run.googleapis.com/ingress-status": "all", - "run.googleapis.com/operation-id": "a9aeb6de-fdd6-43b2-93f8-8b7f72afab4c", + "run.googleapis.com/operation-id": "2b1374b6-6b03-4d99-a76c-b8e751f46df0", "run.googleapis.com/urls": "[\"https://sereact-761163285547.us-central1.run.app\",\"https://sereact-p64zpdtkta-uc.a.run.app\"]", "serving.knative.dev/creator": "johnpccd3@gmail.com", "serving.knative.dev/lastModifier": "johnpccd3@gmail.com" @@ -184,12 +184,12 @@ "generation": 1, "labels": null, "namespace": "gen-lang-client-0424120530", - "resource_version": "AAY19MELEOc", + "resource_version": "AAY191YVk4g", "self_link": "/apis/serving.knative.dev/v1/namespaces/761163285547/services/sereact", "terraform_labels": { "goog-terraform-provisioned": "true" }, - "uid": "8c8be11c-c607-4caa-a65e-c552ec445882" + "uid": "d8b0e29e-2db1-4f23-8b6c-d7238a9a5f89" } ], "name": "sereact", @@ -216,14 +216,14 @@ "type": "RoutesReady" } ], - "latest_created_revision_name": "sereact-00001-z4g", - "latest_ready_revision_name": "sereact-00001-z4g", + "latest_created_revision_name": "sereact-00001-htg", + "latest_ready_revision_name": "sereact-00001-htg", "observed_generation": 1, "traffic": [ { "latest_revision": true, "percent": 100, - "revision_name": "sereact-00001-z4g", + "revision_name": "sereact-00001-htg", "tag": "", "url": "" } @@ -440,7 +440,7 @@ "schema_version": 0, "attributes": { "condition": [], - "etag": "BwY19MG70Fs=", + "etag": "BwY191avNJc=", "id": "v1/projects/gen-lang-client-0424120530/locations/us-central1/services/sereact/roles/run.invoker/allUsers", "location": "us-central1", "member": "allUsers", @@ -474,7 +474,7 @@ "automatic_update_policy": [ {} ], - "build": "projects/761163285547/locations/us-central1/builds/ae88c918-6bb3-4aef-a56f-270f48f73049", + "build": "projects/761163285547/locations/us-central1/builds/3804d611-8e49-491b-b994-4ed7fc528e92", "docker_repository": "projects/gen-lang-client-0424120530/locations/us-central1/repositories/gcf-artifacts", "entry_point": "process_image_embedding", "environment_variables": {}, @@ -487,8 +487,8 @@ "storage_source": [ { "bucket": "gen-lang-client-0424120530-cloud-function-source", - "generation": 1748175166697242, - "object": "function-source-b532cf3ff81d62dd7dec013e486931aa.zip" + "generation": 1748186294324568, + "object": "function-source-8e8a10dd705e45fe8645782a20d999a3.zip" } ] } @@ -508,7 +508,7 @@ "pubsub_topic": "projects/gen-lang-client-0424120530/topics/image-processing-topic", "retry_policy": "RETRY_POLICY_RETRY", "service_account_email": "761163285547-compute@developer.gserviceaccount.com", - "trigger": "projects/gen-lang-client-0424120530/locations/us-central1/triggers/process-image-embedding-645734", + "trigger": "projects/gen-lang-client-0424120530/locations/us-central1/triggers/process-image-embedding-873142", "trigger_region": "us-central1" } ], @@ -559,7 +559,7 @@ "goog-terraform-provisioned": "true" }, "timeouts": null, - "update_time": "2025-05-25T12:15:22.215124150Z", + "update_time": "2025-05-25T15:20:42.834818284Z", "url": "https://us-central1-gen-lang-client-0424120530.cloudfunctions.net/process-image-embedding" }, "sensitive_attributes": [ @@ -875,8 +875,8 @@ "database_edition": "STANDARD", "delete_protection_state": "DELETE_PROTECTION_DISABLED", "deletion_policy": "ABANDON", - "earliest_version_time": "2025-05-25T11:12:43.126081Z", - "etag": "IPjb6fzLvo0DMKrW4vCEvY0D", + "earliest_version_time": "2025-05-25T14:17:09.893967Z", + "etag": "IKfR75n1vo0DMKrW4vCEvY0D", "id": "projects/gen-lang-client-0424120530/databases/sereact-imagedb", "key_prefix": "", "location_id": "us-central1", @@ -1514,21 +1514,21 @@ "content_encoding": "", "content_language": "", "content_type": "application/zip", - "crc32c": "EgiVnQ==", + "crc32c": "cMTkTw==", "customer_encryption": [], - "detect_md5hash": "tTLPP/gdYt197AE+SGkxqg==", + "detect_md5hash": "jooQ3XBeRf6GRXgqINmZow==", "event_based_hold": false, - "generation": 1748174860755303, - "id": "gen-lang-client-0424120530-cloud-function-source-function-source-b532cf3ff81d62dd7dec013e486931aa.zip", + "generation": 1748186292978895, + "id": "gen-lang-client-0424120530-cloud-function-source-function-source-8e8a10dd705e45fe8645782a20d999a3.zip", "kms_key_name": "", - "md5hash": "tTLPP/gdYt197AE+SGkxqg==", - "md5hexhash": "b532cf3ff81d62dd7dec013e486931aa", - "media_link": "https://storage.googleapis.com/download/storage/v1/b/gen-lang-client-0424120530-cloud-function-source/o/function-source-b532cf3ff81d62dd7dec013e486931aa.zip?generation=1748174860755303\u0026alt=media", - "metadata": {}, - "name": "function-source-b532cf3ff81d62dd7dec013e486931aa.zip", - "output_name": "function-source-b532cf3ff81d62dd7dec013e486931aa.zip", + "md5hash": "jooQ3XBeRf6GRXgqINmZow==", + "md5hexhash": "8e8a10dd705e45fe8645782a20d999a3", + "media_link": "https://storage.googleapis.com/download/storage/v1/b/gen-lang-client-0424120530-cloud-function-source/o/function-source-8e8a10dd705e45fe8645782a20d999a3.zip?generation=1748186292978895\u0026alt=media", + "metadata": null, + "name": "function-source-8e8a10dd705e45fe8645782a20d999a3.zip", + "output_name": "function-source-8e8a10dd705e45fe8645782a20d999a3.zip", "retention": [], - "self_link": "https://www.googleapis.com/storage/v1/b/gen-lang-client-0424120530-cloud-function-source/o/function-source-b532cf3ff81d62dd7dec013e486931aa.zip", + "self_link": "https://www.googleapis.com/storage/v1/b/gen-lang-client-0424120530-cloud-function-source/o/function-source-8e8a10dd705e45fe8645782a20d999a3.zip", "source": "./function-source.zip", "storage_class": "STANDARD", "temporary_hold": false, diff --git a/deployment/terraform/terraform.tfstate.backup b/deployment/terraform/terraform.tfstate.backup index bfe8cdd..a988b87 100644 --- a/deployment/terraform/terraform.tfstate.backup +++ b/deployment/terraform/terraform.tfstate.backup @@ -1,7 +1,7 @@ { "version": 4, "terraform_version": "1.10.1", - "serial": 441, + "serial": 445, "lineage": "a183cd95-f987-8698-c6dd-84e933c394a5", "outputs": { "cloud_function_name": { @@ -172,7 +172,7 @@ "effective_annotations": { "run.googleapis.com/ingress": "all", "run.googleapis.com/ingress-status": "all", - "run.googleapis.com/operation-id": "0f195b05-99ac-4d28-b5fe-2d3dea289124", + "run.googleapis.com/operation-id": "a9aeb6de-fdd6-43b2-93f8-8b7f72afab4c", "run.googleapis.com/urls": "[\"https://sereact-761163285547.us-central1.run.app\",\"https://sereact-p64zpdtkta-uc.a.run.app\"]", "serving.knative.dev/creator": "johnpccd3@gmail.com", "serving.knative.dev/lastModifier": "johnpccd3@gmail.com" @@ -182,14 +182,14 @@ "goog-terraform-provisioned": "true" }, "generation": 1, - "labels": {}, + "labels": null, "namespace": "gen-lang-client-0424120530", - "resource_version": "AAY189oNgAQ", + "resource_version": "AAY19MELEOc", "self_link": "/apis/serving.knative.dev/v1/namespaces/761163285547/services/sereact", "terraform_labels": { "goog-terraform-provisioned": "true" }, - "uid": "20e61eb3-6217-40e8-8ae5-45111d31bbda" + "uid": "8c8be11c-c607-4caa-a65e-c552ec445882" } ], "name": "sereact", @@ -216,14 +216,14 @@ "type": "RoutesReady" } ], - "latest_created_revision_name": "sereact-00001-2lz", - "latest_ready_revision_name": "sereact-00001-2lz", + "latest_created_revision_name": "sereact-00001-z4g", + "latest_ready_revision_name": "sereact-00001-z4g", "observed_generation": 1, "traffic": [ { "latest_revision": true, "percent": 100, - "revision_name": "sereact-00001-2lz", + "revision_name": "sereact-00001-z4g", "tag": "", "url": "" } @@ -256,8 +256,8 @@ "container_concurrency": 80, "containers": [ { - "args": [], - "command": [], + "args": null, + "command": null, "env": [ { "name": "API_KEY_SECRET", @@ -337,7 +337,7 @@ "cpu": "1", "memory": "1Gi" }, - "requests": {} + "requests": null } ], "startup_probe": [ @@ -359,7 +359,7 @@ "working_dir": "" } ], - "node_selector": {}, + "node_selector": null, "service_account_name": "761163285547-compute@developer.gserviceaccount.com", "serving_state": "", "timeout_seconds": 300, @@ -440,7 +440,7 @@ "schema_version": 0, "attributes": { "condition": [], - "etag": "BwY189qg+AA=", + "etag": "BwY19MG70Fs=", "id": "v1/projects/gen-lang-client-0424120530/locations/us-central1/services/sereact/roles/run.invoker/allUsers", "location": "us-central1", "member": "allUsers", @@ -474,7 +474,7 @@ "automatic_update_policy": [ {} ], - "build": "projects/761163285547/locations/us-central1/builds/aab08c74-df86-4cd7-9176-4ff267cab3e6", + "build": "projects/761163285547/locations/us-central1/builds/ae88c918-6bb3-4aef-a56f-270f48f73049", "docker_repository": "projects/gen-lang-client-0424120530/locations/us-central1/repositories/gcf-artifacts", "entry_point": "process_image_embedding", "environment_variables": {}, @@ -487,8 +487,8 @@ "storage_source": [ { "bucket": "gen-lang-client-0424120530-cloud-function-source", - "generation": 1748171376287077, - "object": "function-source-58a2b7fe53bb2c8c921405cc965d635c.zip" + "generation": 1748175166697242, + "object": "function-source-b532cf3ff81d62dd7dec013e486931aa.zip" } ] } @@ -508,13 +508,13 @@ "pubsub_topic": "projects/gen-lang-client-0424120530/topics/image-processing-topic", "retry_policy": "RETRY_POLICY_RETRY", "service_account_email": "761163285547-compute@developer.gserviceaccount.com", - "trigger": "projects/gen-lang-client-0424120530/locations/us-central1/triggers/process-image-embedding-013009", + "trigger": "projects/gen-lang-client-0424120530/locations/us-central1/triggers/process-image-embedding-645734", "trigger_region": "us-central1" } ], "id": "projects/gen-lang-client-0424120530/locations/us-central1/functions/process-image-embedding", "kms_key_name": "", - "labels": {}, + "labels": null, "location": "us-central1", "name": "process-image-embedding", "project": "gen-lang-client-0424120530", @@ -559,7 +559,7 @@ "goog-terraform-provisioned": "true" }, "timeouts": null, - "update_time": "2025-05-25T11:13:04.212724797Z", + "update_time": "2025-05-25T12:15:22.215124150Z", "url": "https://us-central1-gen-lang-client-0424120530.cloudfunctions.net/process-image-embedding" }, "sensitive_attributes": [ @@ -588,6 +588,7 @@ } ] ], + "private": "eyJlMmJmYjczMC1lY2FhLTExZTYtOGY4OC0zNDM2M2JjN2M0YzAiOnsiY3JlYXRlIjozNjAwMDAwMDAwMDAwLCJkZWxldGUiOjM2MDAwMDAwMDAwMDAsInVwZGF0ZSI6MzYwMDAwMDAwMDAwMH19", "dependencies": [ "data.archive_file.function_source", "data.google_project.current", @@ -602,13 +603,6 @@ } ] }, - { - "mode": "managed", - "type": "google_compute_address", - "name": "vector_db_static_ip", - "provider": "provider[\"registry.terraform.io/hashicorp/google\"]", - "instances": [] - }, { "mode": "managed", "type": "google_compute_firewall", @@ -818,7 +812,18 @@ [ { "type": "get_attr", - "value": "metadata_startup_script" + "value": "boot_disk" + }, + { + "type": "index", + "value": { + "value": 0, + "type": "number" + } + }, + { + "type": "get_attr", + "value": "disk_encryption_key_rsa" } ], [ @@ -841,18 +846,7 @@ [ { "type": "get_attr", - "value": "boot_disk" - }, - { - "type": "index", - "value": { - "value": 0, - "type": "number" - } - }, - { - "type": "get_attr", - "value": "disk_encryption_key_rsa" + "value": "metadata_startup_script" } ] ], @@ -881,8 +875,8 @@ "database_edition": "STANDARD", "delete_protection_state": "DELETE_PROTECTION_DISABLED", "deletion_policy": "ABANDON", - "earliest_version_time": "2025-05-25T11:07:20.673706Z", - "etag": "IIrliOPKvo0DMKrW4vCEvY0D", + "earliest_version_time": "2025-05-25T11:12:43.126081Z", + "etag": "IPjb6fzLvo0DMKrW4vCEvY0D", "id": "projects/gen-lang-client-0424120530/databases/sereact-imagedb", "key_prefix": "", "location_id": "us-central1", @@ -1530,7 +1524,7 @@ "md5hash": "tTLPP/gdYt197AE+SGkxqg==", "md5hexhash": "b532cf3ff81d62dd7dec013e486931aa", "media_link": "https://storage.googleapis.com/download/storage/v1/b/gen-lang-client-0424120530-cloud-function-source/o/function-source-b532cf3ff81d62dd7dec013e486931aa.zip?generation=1748174860755303\u0026alt=media", - "metadata": null, + "metadata": {}, "name": "function-source-b532cf3ff81d62dd7dec013e486931aa.zip", "output_name": "function-source-b532cf3ff81d62dd7dec013e486931aa.zip", "retention": [], diff --git a/simple_search_test.py b/simple_search_test.py deleted file mode 100644 index eafc082..0000000 --- a/simple_search_test.py +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env python3 -""" -Simple test script to embed text and search Qdrant without filters -""" - -import os -import sys -import asyncio -import logging -from dotenv import load_dotenv - -# Load environment variables -load_dotenv() - -# Set up logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -async def simple_search_test(): - """Simple test: embed text and search without filters""" - - try: - # Import services - from src.services.vector_db import VectorDatabaseService - from src.services.embedding_service import EmbeddingService - - # Initialize services - logger.info("Initializing services...") - vector_db = VectorDatabaseService() - embedding_service = EmbeddingService() - - # Test 1: Generate text embedding - logger.info("=== Generating Text Embedding ===") - search_query = "rectangle" - text_embedding = await embedding_service.generate_text_embedding(search_query) - - if text_embedding: - logger.info(f"āœ“ Generated embedding for '{search_query}' - length: {len(text_embedding)}") - else: - logger.error("āœ— Failed to generate text embedding") - return False - - # Test 2: Search without any filters - logger.info("=== Searching Qdrant (No Filters) ===") - - # Try different thresholds to see what we get - thresholds = [0.1, 0.3, 0.5, 0.65, 0.8] - - for threshold in thresholds: - logger.info(f"\n--- Threshold: {threshold} ---") - - search_results = vector_db.search_similar_images( - query_vector=text_embedding, - limit=10, - score_threshold=threshold - # No filter_conditions = search everything - ) - - logger.info(f"Found {len(search_results)} results") - - # Show top 3 results - for i, result in enumerate(search_results[:3]): - logger.info(f" {i+1}. Score: {result['score']:.4f} | ID: {result['image_id']} | File: {result['metadata'].get('filename', 'N/A')}") - - # Test 3: Very low threshold to see all data - logger.info("\n=== All Data (Threshold 0.0) ===") - all_results = vector_db.search_similar_images( - query_vector=text_embedding, - limit=50, - score_threshold=0.0 # Get everything - ) - - logger.info(f"Total vectors in collection: {len(all_results)}") - - # Test 4: With team filtering (like the API does) - logger.info("\n=== Testing Team Filtering ===") - test_team_id = "68330a29472a0704d2f77063" # From server logs - filtered_results = vector_db.search_similar_images( - query_vector=text_embedding, - limit=50, - score_threshold=0.0, - filter_conditions={"team_id": test_team_id} - ) - - logger.info(f"Results with team filter ({test_team_id}): {len(filtered_results)}") - - # Show metadata for all results to see team_ids - logger.info("\n=== Checking Team IDs in Vector DB ===") - for i, result in enumerate(all_results): - metadata = result.get('metadata', {}) - team_id = metadata.get('team_id', 'N/A') - logger.info(f" {i+1}. Image ID: {result['image_id']} | Team ID: {team_id}") - - # Show some stats - if all_results: - scores = [r['score'] for r in all_results] - logger.info(f"Score range: {min(scores):.4f} to {max(scores):.4f}") - logger.info(f"Average score: {sum(scores)/len(scores):.4f}") - - # Show top 5 and bottom 5 - logger.info("\nTop 5 results:") - for i, result in enumerate(all_results[:5]): - logger.info(f" {i+1}. Score: {result['score']:.4f} | ID: {result['image_id']}") - - if len(all_results) > 5: - logger.info("\nBottom 5 results:") - for i, result in enumerate(all_results[-5:]): - logger.info(f" {len(all_results)-4+i}. Score: {result['score']:.4f} | ID: {result['image_id']}") - - logger.info("\nāœ“ Simple search test completed!") - return True - - except Exception as e: - logger.error(f"āœ— Test failed: {e}") - import traceback - traceback.print_exc() - return False - -if __name__ == "__main__": - success = asyncio.run(simple_search_test()) - sys.exit(0 if success else 1) \ No newline at end of file diff --git a/src/api/v1/images.py b/src/api/v1/images.py index 3e80001..96e1029 100644 --- a/src/api/v1/images.py +++ b/src/api/v1/images.py @@ -76,7 +76,7 @@ async def upload_image( file_size=file_size, content_type=content_type, storage_path=storage_path, - public_url="", # Will be set after we have the image ID + public_url=None, # Will be set after we have the image ID team_id=current_user.team_id, uploader_id=current_user.id, description=description, diff --git a/src/api/v1/search.py b/src/api/v1/search.py index cdc5b84..53636be 100644 --- a/src/api/v1/search.py +++ b/src/api/v1/search.py @@ -33,7 +33,7 @@ async def search_images( request: Request, q: str = Query(..., description="Search query"), limit: int = Query(10, ge=1, le=50, description="Number of results to return"), - threshold: float = Query(0.65, ge=0.0, le=1.0, description="Similarity threshold"), + similarity_threshold: float = Query(0.65, ge=0.0, le=1.0, description="Similarity threshold"), collection_id: Optional[str] = Query(None, description="Filter by collection ID"), current_user: UserModel = Depends(get_current_user) ): @@ -46,7 +46,7 @@ async def search_images( "method": request.method, "query": q, "limit": limit, - "threshold": threshold + "similarity_threshold": similarity_threshold }, user_id=str(current_user.id), team_id=str(current_user.team_id) @@ -62,7 +62,7 @@ async def search_images( search_results = get_vector_db_service().search_similar_images( query_vector=query_embedding, limit=limit, - score_threshold=threshold, + similarity_threshold=similarity_threshold, filter_conditions={"team_id": str(current_user.team_id)} if current_user.team_id else None ) @@ -72,12 +72,12 @@ async def search_images( results=[], total=0, limit=limit, - threshold=threshold + similarity_threshold=similarity_threshold ) # Get image IDs and scores from search results image_ids = [result['image_id'] for result in search_results if result['image_id']] - scores = {result['image_id']: result['score'] for result in search_results if result['image_id']} + scores = {result['image_id']: result['similarity_score'] for result in search_results if result['image_id']} # Get image metadata from database images = await image_repository.get_by_ids(image_ids) @@ -123,7 +123,7 @@ async def search_images( results=results, total=len(results), limit=limit, - threshold=threshold + similarity_threshold=similarity_threshold ) except Exception as e: @@ -160,11 +160,11 @@ async def search_images_advanced( logger.info(f"Generated embedding with length: {len(query_embedding)}") # Search in vector database - logger.info(f"Searching vector database with threshold: {search_request.threshold}") + logger.info(f"Searching vector database with similarity_threshold: {search_request.similarity_threshold}") search_results = get_vector_db_service().search_similar_images( query_vector=query_embedding, limit=search_request.limit, - score_threshold=search_request.threshold, + similarity_threshold=search_request.similarity_threshold, filter_conditions={"team_id": str(current_user.team_id)} if current_user.team_id else None ) @@ -177,12 +177,12 @@ async def search_images_advanced( results=[], total=0, limit=search_request.limit, - threshold=search_request.threshold + similarity_threshold=search_request.similarity_threshold ) # Get image IDs and scores from search results image_ids = [result['image_id'] for result in search_results if result['image_id']] - scores = {result['image_id']: result['score'] for result in search_results if result['image_id']} + scores = {result['image_id']: result['similarity_score'] for result in search_results if result['image_id']} logger.info(f"Extracted {len(image_ids)} image IDs: {image_ids}") @@ -247,7 +247,7 @@ async def search_images_advanced( results=results, total=len(results), limit=search_request.limit, - threshold=search_request.threshold + similarity_threshold=search_request.similarity_threshold ) except Exception as e: diff --git a/src/schemas/search.py b/src/schemas/search.py index bcbf3b0..95ae11c 100644 --- a/src/schemas/search.py +++ b/src/schemas/search.py @@ -1,6 +1,6 @@ from typing import List, Optional, ClassVar from datetime import datetime -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator from src.schemas.image import ImageResponse @@ -8,18 +8,25 @@ class SearchRequest(BaseModel): """Schema for advanced search request""" query: str = Field(..., description="Search query", min_length=1) limit: int = Field(10, description="Maximum number of results", ge=1, le=50) - threshold: float = Field(0.7, description="Similarity threshold", ge=0.0, le=1.0) + similarity_threshold: Optional[float] = Field(None, description="Similarity threshold", ge=0.0, le=1.0) collection_id: Optional[str] = Field(None, description="Filter by collection ID") date_from: Optional[datetime] = Field(None, description="Filter images uploaded after this date") date_to: Optional[datetime] = Field(None, description="Filter images uploaded before this date") uploader_id: Optional[str] = Field(None, description="Filter by uploader ID") + @field_validator('similarity_threshold') + @classmethod + def set_default_similarity_threshold(cls, v): + if v is None: + return 0.7 + return v + model_config: ClassVar[dict] = { "json_schema_extra": { "example": { "query": "mountain sunset", "limit": 10, - "threshold": 0.7, + "similarity_threshold": 0.7, "collection_id": "507f1f77bcf86cd799439044", "date_from": "2023-01-01T00:00:00", "date_to": "2023-12-31T23:59:59", @@ -34,7 +41,7 @@ class SearchResponse(BaseModel): results: List[ImageResponse] total: int limit: int - threshold: float + similarity_threshold: float model_config: ClassVar[dict] = { "json_schema_extra": { @@ -66,7 +73,7 @@ class SearchResponse(BaseModel): ], "total": 1, "limit": 10, - "threshold": 0.7 + "similarity_threshold": 0.7 } } } \ No newline at end of file diff --git a/src/services/vector_db.py b/src/services/vector_db.py index 55b42f2..6d79f18 100644 --- a/src/services/vector_db.py +++ b/src/services/vector_db.py @@ -139,7 +139,7 @@ class VectorDatabaseService: self, query_vector: List[float], limit: int = 10, - score_threshold: float = 0.65, + similarity_threshold: float = 0.65, filter_conditions: Dict[str, Any] = None ) -> List[Dict[str, Any]]: """ @@ -148,7 +148,7 @@ class VectorDatabaseService: Args: query_vector: Query vector to search for limit: Maximum number of results to return - score_threshold: Minimum similarity score threshold + similarity_threshold: Minimum similarity score threshold filter_conditions: Additional filter conditions Returns: @@ -172,14 +172,14 @@ class VectorDatabaseService: query_vector=query_vector, query_filter=search_filter, limit=limit, - score_threshold=score_threshold + score_threshold=similarity_threshold ) results = [] for hit in search_result: result = { "point_id": hit.id, - "score": hit.score, + "similarity_score": hit.score, "image_id": hit.payload.get("image_id"), "metadata": hit.payload } diff --git a/start_dev.sh b/start_dev.sh deleted file mode 100644 index 802bdc3..0000000 --- a/start_dev.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash - -# Development startup script for Sereact API -# This script sets the environment variables and starts the application -# Auto-generated by deployment/scripts/setup_local_env.sh - -# Function to handle cleanup on exit -cleanup() { - echo "" - echo "Shutting down server..." - if [ ! -z "$SERVER_PID" ]; then - kill $SERVER_PID 2>/dev/null - wait $SERVER_PID 2>/dev/null - fi - echo "Server stopped." - exit 0 -} - -# Set up signal handlers -trap cleanup SIGINT SIGTERM - -# Activate virtual environment -source venv/Scripts/activate - -# Set environment variables from deployed infrastructure -export QDRANT_HOST=34.171.134.17 -export QDRANT_PORT=6333 -export FIRESTORE_PROJECT_ID=gen-lang-client-0424120530 -export GCS_BUCKET_NAME=sereact-images -export ENVIRONMENT=development - -# Start the application -echo "Starting Sereact API with deployed infrastructure..." -echo "Qdrant endpoint: http://$QDRANT_HOST:$QDRANT_PORT" -echo "Firestore project: $FIRESTORE_PROJECT_ID" -echo "GCS bucket: $GCS_BUCKET_NAME" -echo "API will be available at: http://localhost:8000" -echo "API documentation: http://localhost:8000/docs" -echo "Press Ctrl+C to stop the server" -echo "" - -# Start uvicorn in background and capture PID -uvicorn main:app --host 0.0.0.0 --port 8000 --reload & -SERVER_PID=$! - -# Wait for the server process -wait $SERVER_PID diff --git a/test_threshold_fix.py b/test_threshold_fix.py deleted file mode 100644 index 506b9bc..0000000 --- a/test_threshold_fix.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -""" -Test script to verify that similarity threshold is properly handled -""" - -import json -from src.schemas.search import SearchRequest - -def test_threshold_handling(): - """Test that threshold values are properly handled in the schema""" - - # Test with threshold = 0 - test_data_zero = { - "query": "test query", - "threshold": 0.0, - "limit": 10 - } - - request_zero = SearchRequest(**test_data_zero) - print(f"Threshold 0.0 test: {request_zero.threshold}") - assert request_zero.threshold == 0.0, f"Expected 0.0, got {request_zero.threshold}" - - # Test with threshold = 0.5 - test_data_half = { - "query": "test query", - "threshold": 0.5, - "limit": 10 - } - - request_half = SearchRequest(**test_data_half) - print(f"Threshold 0.5 test: {request_half.threshold}") - assert request_half.threshold == 0.5, f"Expected 0.5, got {request_half.threshold}" - - # Test with threshold = 1.0 - test_data_one = { - "query": "test query", - "threshold": 1.0, - "limit": 10 - } - - request_one = SearchRequest(**test_data_one) - print(f"Threshold 1.0 test: {request_one.threshold}") - assert request_one.threshold == 1.0, f"Expected 1.0, got {request_one.threshold}" - - print("All threshold tests passed!") - -if __name__ == "__main__": - test_threshold_handling() \ No newline at end of file