diff --git a/README.md b/README.md index 7daf355..77708de 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,8 @@ SEREACT is a secure API for storing, organizing, and retrieving images with adva - API key authentication - **Asynchronous image processing with Pub/Sub and Cloud Functions** - **AI-powered image embeddings using Google Cloud Vision API** -- **Semantic search using vector similarity in Pinecone** +- **Semantic search using vector similarity with Qdrant Vector Database** +- **Self-hosted vector database on Google Compute Engine VM** - **Automatic retry mechanism for failed processing (up to 3 attempts)** - Metadata extraction and storage - Image processing capabilities @@ -25,6 +26,8 @@ sereact/ │ ├── cloud-function/ # **Cloud Function for image processing** │ ├── cloud-run/ # Google Cloud Run configuration │ └── terraform/ # Infrastructure as code + │ ├── vm.tf # **Vector database VM configuration** + │ └── scripts/ # **VM installation scripts** ├── docs/ # Documentation │ ├── api/ # API documentation │ └── TESTING.md # Comprehensive testing guide @@ -41,7 +44,8 @@ sereact/ │ ├── models/ # Database models │ ├── schemas/ # API request/response schemas │ ├── services/ # Business logic services - │ │ └── pubsub_service.py # **Pub/Sub message publishing** + │ │ ├── pubsub_service.py # **Pub/Sub message publishing** + │ │ └── vector_db.py # **Qdrant vector database service** │ └── utils/ # Utility functions ├── tests/ # Test code │ ├── api/ # API tests @@ -79,9 +83,9 @@ sereact/ ▼ ┌─────────────┐ ┌─────────────┐ │ │ │ │ - │ Cloud │ │ Pinecone │ + │ Cloud │ │ Qdrant │ │ Vision API │────────▶│ Vector DB │ - │ │ │ │ + │ │ │ (VM) │ └─────────────┘ └─────────────┘ ``` @@ -98,7 +102,7 @@ sereact/ - Function updates image status to `"processing"` - **Function downloads image from Cloud Storage** - **Function calls Google Cloud Vision API to generate embeddings** - - **Embeddings are stored in Pinecone Vector Database** + - **Embeddings are stored in Qdrant Vector 
Database on dedicated VM** - **Firestore is updated with embedding info and status: "success"** ### 3. **Error Handling & Retry**: @@ -108,7 +112,7 @@ sereact/ ### 4. **Search Flow**: - Search queries processed by FastAPI backend - - Vector similarity search performed against Pinecone + - Vector similarity search performed against Qdrant VM - Results combined with metadata from Firestore ## Technology Stack @@ -119,16 +123,44 @@ sereact/ - **Google Pub/Sub** - Message queue for async processing - **Google Cloud Functions** - Serverless image processing - **Google Cloud Vision API** - AI-powered image analysis and embedding generation -- **Pinecone** - Vector database for semantic search +- **Qdrant** - Self-hosted vector database for semantic search (on Google Compute Engine VM) +- **Google Compute Engine** - VM hosting for vector database - **Pydantic** - Data validation +## **Vector Database Infrastructure** + +### **Qdrant Vector Database VM** + +The system includes a dedicated Google Compute Engine VM running Qdrant vector database: + +- **VM Specifications**: 2 vCPUs, 8GB RAM, 50GB disk (e2-standard-2) +- **Operating System**: Ubuntu 22.04 LTS +- **Vector Database**: Qdrant (latest version via Docker) +- **Ports**: 6333 (HTTP API), 6334 (gRPC API) +- **Features**: + - Automatic installation and configuration via startup script + - Daily automated backups + - Health monitoring + - Firewall configuration + - Optional static IP assignment + - API key authentication support + +### **Vector Database Features** + +- **High Performance**: Optimized for image vector similarity search +- **Scalable**: Can handle millions of image vectors +- **Persistent Storage**: Data persisted on VM disk with automated backups +- **RESTful API**: Easy integration with Python client +- **Cosine Similarity**: Optimized for image embedding comparisons +- **Metadata Filtering**: Support for complex search filters + ## Setup and Installation ### Prerequisites - Python 3.8+ -- Google Cloud 
account with Firestore, Storage, Pub/Sub, Cloud Functions, and Vision API enabled -- Pinecone account for vector database +- Google Cloud account with Firestore, Storage, Pub/Sub, Cloud Functions, Compute Engine, and Vision API enabled +- Terraform (for infrastructure deployment) ### Installation @@ -170,34 +202,79 @@ sereact/ # Security API_KEY_SECRET=your-secret-key - # Vector database (Pinecone) - VECTOR_DB_API_KEY=your-pinecone-api-key - VECTOR_DB_ENVIRONMENT=your-pinecone-environment - VECTOR_DB_INDEX_NAME=image-embeddings + # Vector database (Qdrant) + QDRANT_HOST=your-vm-external-ip + QDRANT_API_KEY=your-qdrant-api-key # Optional ``` -5. **Deploy Infrastructure** (Optional - for production): +5. **Deploy Infrastructure** (Required for vector database): ```bash - # Deploy Pub/Sub infrastructure with Terraform + # Configure Terraform variables cd deployment/terraform + cp terraform.tfvars.example terraform.tfvars + # Edit terraform.tfvars with your values + + # Deploy infrastructure including vector database VM terraform init terraform plan terraform apply - # Deploy Cloud Function + # Note the output values for VM IP addresses + ``` + +6. **Deploy Cloud Function** (Optional - for production): + ```bash cd ../cloud-function ./deploy.sh ``` -6. Run the application: +7. Run the application: ```bash uvicorn main:app --reload ``` -7. Visit `http://localhost:8000/docs` in your browser to access the API documentation. +8. Visit `http://localhost:8000/docs` in your browser to access the API documentation. 
## **Deployment** +### **Complete Infrastructure Deployment** + +Deploy the entire infrastructure including the vector database VM: + +```bash +cd deployment/terraform + +# Configure your variables +cp terraform.tfvars.example terraform.tfvars +# Edit terraform.tfvars with your specific values: +# - project_id: Your GCP project ID +# - storage_bucket_name: Unique bucket name +# - qdrant_api_key: Secure API key for Qdrant (optional) +# - allowed_cidr_blocks: Your IP address/range for security +# - use_static_ip: Set to true for production + +# Deploy infrastructure +terraform init +terraform plan +terraform apply +``` + +This will create: +- **Google Compute Engine VM with Qdrant vector database** +- **Firewall rules for vector database access** +- **Service accounts and IAM bindings** +- **Pub/Sub topic and subscription with retry policy** +- **Cloud Storage bucket** +- **Firestore database** +- **Cloud Run service** + +### **Vector Database VM Outputs** + +After deployment, Terraform will output: +- `vector_db_vm_external_ip`: External IP address of the VM +- `qdrant_http_endpoint`: HTTP API endpoint for Qdrant +- `qdrant_grpc_endpoint`: gRPC API endpoint for Qdrant + ### **Cloud Function Deployment** The image processing Cloud Function can be deployed using the provided script: @@ -207,34 +284,59 @@ cd deployment/cloud-function # Set environment variables export GOOGLE_CLOUD_PROJECT=your-project-id -export PINECONE_API_KEY=your-pinecone-api-key -export PINECONE_ENVIRONMENT=your-pinecone-environment +export QDRANT_HOST=your-vm-external-ip +export QDRANT_API_KEY=your-qdrant-api-key # Deploy the function ./deploy.sh ``` -### **Infrastructure as Code** +### **Vector Database Management** -Use Terraform to deploy the complete infrastructure: +#### **Accessing the Vector Database** ```bash -cd deployment/terraform +# SSH into the VM +gcloud compute ssh sereact-vector-db --zone=us-central1-a -# Initialize Terraform -terraform init +# Check Qdrant status +sudo systemctl 
status qdrant -# Review the deployment plan -terraform plan +# View logs +sudo journalctl -u qdrant -f -# Deploy infrastructure -terraform apply +# Run health check +sudo /opt/qdrant/health_check.sh + +# Manual backup +sudo /opt/qdrant/backup.sh ``` -This will create: -- **Pub/Sub topic and subscription with retry policy** -- **Dead letter queue for failed messages** -- **IAM bindings for service accounts** +#### **Vector Database API Usage** + +```python +from src.services.vector_db import VectorDatabaseService + +# Initialize service +vector_db = VectorDatabaseService( + host="your-vm-external-ip", + api_key="your-qdrant-api-key" # Optional +) + +# Add image vector +point_id = vector_db.add_image_vector( + image_id="img_123", + vector=[0.1, 0.2, ...], # 512-dimensional vector + metadata={"filename": "image.jpg", "size": 1024} +) + +# Search similar images +results = vector_db.search_similar_images( + query_vector=[0.1, 0.2, ...], + limit=10, + score_threshold=0.7 +) +``` ## API Endpoints @@ -244,7 +346,7 @@ The API provides the following main endpoints: - `/api/v1/teams/*` - Team management - `/api/v1/users/*` - User management - `/api/v1/images/*` - **Image upload, download, and management (with async processing)** -- `/api/v1/search/*` - **Image search functionality (semantic search)** +- `/api/v1/search/*` - **Image search functionality (semantic search via Qdrant)** ### **Image Processing Status** @@ -273,8 +375,9 @@ pytest # Run specific test categories pytest tests/services/test_pubsub_service.py # Pub/Sub service tests +pytest tests/services/test_vector_db.py # Vector database tests pytest tests/integration/test_cloud_function.py # Cloud Function tests -pytest tests/api/test_images_pubsub.py # API integration tests +pytest tests/api/test_images_pubsub.py # API integration tests ``` ### **Comprehensive End-to-End Testing** @@ -292,6 +395,26 @@ python scripts/run_tests.py unit python scripts/run_tests.py integration ``` +## **Infrastructure Costs** + +### 
**Estimated Monthly Costs (USD)** + +- **Compute Engine VM (e2-standard-2)**: ~$50-70/month +- **Cloud Storage**: $0.02/GB/month +- **Firestore**: $0.18/100K reads, $0.18/100K writes +- **Pub/Sub**: $0.40/million messages +- **Cloud Functions**: $0.40/million invocations +- **Cloud Vision API**: $1.50/1000 images + +**Total estimated cost for moderate usage**: ~$60-100/month + +### **Cost Optimization Tips** + +- Use preemptible VM instances for development (50-91% cost reduction) +- Set up automatic VM shutdown during off-hours +- Use standard persistent disks (pd-standard) instead of SSD for cost savings +- Monitor and set up billing alerts + ## License This project is licensed under the MIT License - see the LICENSE file for details. @@ -308,6 +431,7 @@ src/ ├── config/ # Configuration management ├── models/ # Database models ├── services/ # Business logic services + │ └── vector_db.py # **Qdrant vector database service** └── utils/ # Utility functions ``` @@ -331,7 +455,7 @@ src/ - Contains core business logic - Orchestrates operations across multiple resources - Implements domain-specific rules and workflows -- Integrates with external services (Cloud Vision, Storage) +- Integrates with external services (Cloud Vision, Storage, **Qdrant**) - Handles image processing and embedding generation #### Models Module @@ -392,6 +516,7 @@ The modules interact in the following ways: - Auth Module validates the request authentication - Router delegates to appropriate Service functions - Service uses Models to interact with the database + - **Service integrates with Qdrant Vector Database for similarity search** - Service returns data to Router which formats the response - **Cross-Cutting Concerns**: diff --git a/deployment/cloud-function/deploy.sh b/deployment/cloud-function/deploy.sh index 2051cb6..2223a52 100644 --- a/deployment/cloud-function/deploy.sh +++ b/deployment/cloud-function/deploy.sh @@ -13,9 +13,10 @@ MEMORY=${MEMORY:-"512MB"} TIMEOUT=${TIMEOUT:-"540s"} # Environment 
variables for the function -PINECONE_API_KEY=${PINECONE_API_KEY:-""} -PINECONE_ENVIRONMENT=${PINECONE_ENVIRONMENT:-""} -PINECONE_INDEX_NAME=${PINECONE_INDEX_NAME:-"image-embeddings"} +QDRANT_HOST=${QDRANT_HOST:-""} +QDRANT_PORT=${QDRANT_PORT:-"6333"} +QDRANT_API_KEY=${QDRANT_API_KEY:-""} +QDRANT_COLLECTION=${QDRANT_COLLECTION:-"image_vectors"} echo "Deploying Cloud Function: $FUNCTION_NAME" echo "Project: $PROJECT_ID" @@ -23,12 +24,9 @@ echo "Region: $REGION" echo "Pub/Sub Topic: $PUBSUB_TOPIC" # Check if required environment variables are set -if [ -z "$PINECONE_API_KEY" ]; then - echo "Warning: PINECONE_API_KEY not set. Function will not store embeddings." -fi - -if [ -z "$PINECONE_ENVIRONMENT" ]; then - echo "Warning: PINECONE_ENVIRONMENT not set. Function will not store embeddings." +if [ -z "$QDRANT_HOST" ]; then + echo "Warning: QDRANT_HOST not set. Function will not store embeddings." + echo "Please set QDRANT_HOST to your vector database VM's external IP address." fi # Deploy the function @@ -41,7 +39,7 @@ gcloud functions deploy $FUNCTION_NAME \ --trigger-topic=$PUBSUB_TOPIC \ --memory=$MEMORY \ --timeout=$TIMEOUT \ - --set-env-vars="PINECONE_API_KEY=$PINECONE_API_KEY,PINECONE_ENVIRONMENT=$PINECONE_ENVIRONMENT,PINECONE_INDEX_NAME=$PINECONE_INDEX_NAME" \ + --set-env-vars="QDRANT_HOST=$QDRANT_HOST,QDRANT_PORT=$QDRANT_PORT,QDRANT_API_KEY=$QDRANT_API_KEY,QDRANT_COLLECTION=$QDRANT_COLLECTION" \ --retry \ --max-instances=10 \ --min-instances=0 @@ -50,6 +48,7 @@ echo "Cloud Function deployed successfully!" 
echo "Function name: $FUNCTION_NAME" echo "Trigger: Pub/Sub topic '$PUBSUB_TOPIC'" echo "Region: $REGION" +echo "Qdrant Host: $QDRANT_HOST" # Set up retry policy for the Pub/Sub subscription SUBSCRIPTION_NAME="${PUBSUB_TOPIC}-subscription" diff --git a/deployment/cloud-function/main.py b/deployment/cloud-function/main.py index 046117a..b17af6f 100644 --- a/deployment/cloud-function/main.py +++ b/deployment/cloud-function/main.py @@ -7,11 +7,14 @@ import functions_framework from google.cloud import vision from google.cloud import firestore from google.cloud import storage -import pinecone +from qdrant_client import QdrantClient +from qdrant_client.http import models +from qdrant_client.http.models import Distance, VectorParams, PointStruct import numpy as np from PIL import Image import io import os +import uuid # Configure logging logging.basicConfig(level=logging.INFO) @@ -22,17 +25,47 @@ vision_client = vision.ImageAnnotatorClient() firestore_client = firestore.Client() storage_client = storage.Client() -# Initialize Pinecone -PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY') -PINECONE_ENVIRONMENT = os.environ.get('PINECONE_ENVIRONMENT') -PINECONE_INDEX_NAME = os.environ.get('PINECONE_INDEX_NAME', 'image-embeddings') +# Initialize Qdrant +QDRANT_HOST = os.environ.get('QDRANT_HOST', 'localhost') +QDRANT_PORT = int(os.environ.get('QDRANT_PORT', '6333')) +QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY') +QDRANT_COLLECTION = os.environ.get('QDRANT_COLLECTION', 'image_vectors') -if PINECONE_API_KEY and PINECONE_ENVIRONMENT: - pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONMENT) - index = pinecone.Index(PINECONE_INDEX_NAME) -else: - index = None - logger.warning("Pinecone not configured, embeddings will not be stored") +try: + qdrant_client = QdrantClient( + host=QDRANT_HOST, + port=QDRANT_PORT, + api_key=QDRANT_API_KEY + ) + + # Ensure collection exists + try: + collections = qdrant_client.get_collections() + collection_names = [col.name for 
col in collections.collections] + + if QDRANT_COLLECTION not in collection_names: + logger.info(f"Creating Qdrant collection: {QDRANT_COLLECTION}") + qdrant_client.create_collection( + collection_name=QDRANT_COLLECTION, + vectors_config=VectorParams( + size=512, # Fixed size for image embeddings + distance=Distance.COSINE + ) + ) + logger.info(f"Collection {QDRANT_COLLECTION} created successfully") + else: + logger.info(f"Collection {QDRANT_COLLECTION} already exists") + + except Exception as e: + logger.error(f"Error ensuring Qdrant collection exists: {e}") + qdrant_client = None + +except Exception as e: + logger.error(f"Failed to initialize Qdrant client: {e}") + qdrant_client = None + +if not qdrant_client: + logger.warning("Qdrant not configured, embeddings will not be stored") @functions_framework.cloud_event def process_image_embedding(cloud_event): @@ -121,25 +154,36 @@ def process_image(image_id: str, storage_path: str, team_id: str, retry_count: i logger.error(f"Failed to generate embeddings for image {image_id}") return False - # Store embeddings in Pinecone - if index: - embedding_id = f"{team_id}_{image_id}" + # Store embeddings in Qdrant + if qdrant_client: + point_id = str(uuid.uuid4()) # Prepare metadata metadata = { 'image_id': image_id, 'team_id': team_id, 'storage_path': storage_path, - 'created_at': datetime.utcnow().isoformat() + 'created_at': datetime.utcnow().isoformat(), + 'model': 'google-vision-v1' } - # Upsert to Pinecone - index.upsert(vectors=[(embedding_id, embeddings.tolist(), metadata)]) + # Create point for Qdrant + point = PointStruct( + id=point_id, + vector=embeddings.tolist(), + payload=metadata + ) - logger.info(f"Stored embeddings for image {image_id} in Pinecone") + # Upsert to Qdrant + qdrant_client.upsert( + collection_name=QDRANT_COLLECTION, + points=[point] + ) + + logger.info(f"Stored embeddings for image {image_id} in Qdrant with point ID {point_id}") # Update Firestore with embedding info - 
update_image_embedding_info(image_id, embedding_id, 'google-vision-v1') + update_image_embedding_info(image_id, point_id, 'google-vision-v1') return True @@ -193,6 +237,39 @@ def generate_image_embeddings(image_data: bytes) -> Optional[np.ndarray]: label.score # Confidence score ]) + # Get text detection for additional context + text_response = vision_client.text_detection(image=image) + + if text_response.text_annotations: + # Add text features + text_content = text_response.text_annotations[0].description if text_response.text_annotations else "" + text_hash = hash(text_content.lower()) % 1000 / 1000.0 + features.extend([text_hash, len(text_content) / 1000.0]) # Normalized text length + + # Get face detection for additional features + face_response = vision_client.face_detection(image=image) + + face_count = len(face_response.face_annotations) + features.append(min(face_count / 10.0, 1.0)) # Normalized face count + + # Add image properties + try: + # Get image properties + properties_response = vision_client.image_properties(image=image) + + if properties_response.image_properties_annotation: + # Add dominant colors as features + colors = properties_response.image_properties_annotation.dominant_colors.colors + for i, color in enumerate(colors[:5]): # Top 5 colors + features.extend([ + color.color.red / 255.0, + color.color.green / 255.0, + color.color.blue / 255.0, + color.score + ]) + except Exception as e: + logger.warning(f"Could not extract image properties: {e}") + # Pad or truncate to fixed size (512 dimensions) target_size = 512 if len(features) < target_size: @@ -200,7 +277,13 @@ def generate_image_embeddings(image_data: bytes) -> Optional[np.ndarray]: else: features = features[:target_size] - return np.array(features, dtype=np.float32) + # Normalize the feature vector + features_array = np.array(features, dtype=np.float32) + norm = np.linalg.norm(features_array) + if norm > 0: + features_array = features_array / norm + + return features_array except 
Exception as e: logger.error(f"Error generating embeddings: {e}") @@ -238,20 +321,20 @@ def update_image_status(image_id: str, status: str, retry_count: int, error_mess except Exception as e: logger.error(f"Error updating image status: {e}") -def update_image_embedding_info(image_id: str, embedding_id: str, model: str): +def update_image_embedding_info(image_id: str, point_id: str, model: str): """ Update the image with embedding information Args: image_id: The ID of the image - embedding_id: The ID of the embedding in the vector database + point_id: The ID of the point in the Qdrant vector database model: The model used to generate embeddings """ try: doc_ref = firestore_client.collection('images').document(image_id) update_data = { - 'embedding_id': embedding_id, + 'embedding_point_id': point_id, 'embedding_model': model, 'has_embedding': True } diff --git a/deployment/cloud-function/requirements.txt b/deployment/cloud-function/requirements.txt index eea828c..118bedb 100644 --- a/deployment/cloud-function/requirements.txt +++ b/deployment/cloud-function/requirements.txt @@ -2,6 +2,6 @@ functions-framework==3.4.0 google-cloud-vision==3.4.5 google-cloud-firestore==2.11.1 google-cloud-storage==2.12.0 -pinecone-client==2.2.4 +qdrant-client==1.7.0 numpy==1.24.3 Pillow==10.1.0 \ No newline at end of file diff --git a/deployment/terraform/main.tf b/deployment/terraform/main.tf index bbc3fdd..d3cd701 100644 --- a/deployment/terraform/main.tf +++ b/deployment/terraform/main.tf @@ -11,7 +11,8 @@ resource "google_project_service" "services" { "containerregistry.googleapis.com", "run.googleapis.com", "firestore.googleapis.com", - "storage.googleapis.com" + "storage.googleapis.com", + "compute.googleapis.com" ]) project = var.project_id diff --git a/deployment/terraform/outputs.tf b/deployment/terraform/outputs.tf index ef8f87e..a9e602c 100644 --- a/deployment/terraform/outputs.tf +++ b/deployment/terraform/outputs.tf @@ -16,4 +16,35 @@ output "firestore_database_id" { output 
"container_registry_url" { value = "gcr.io/${var.project_id}/sereact" description = "The URL of the Container Registry repository" +} + +# Vector Database VM outputs +output "vector_db_vm_name" { + value = google_compute_instance.vector_db_vm.name + description = "The name of the vector database VM" +} + +output "vector_db_vm_external_ip" { + value = google_compute_instance.vector_db_vm.network_interface[0].access_config[0].nat_ip + description = "The external IP address of the vector database VM" +} + +output "vector_db_vm_internal_ip" { + value = google_compute_instance.vector_db_vm.network_interface[0].network_ip + description = "The internal IP address of the vector database VM" +} + +output "vector_db_static_ip" { + value = var.use_static_ip ? google_compute_address.vector_db_static_ip.address : null + description = "The static IP address of the vector database VM (if enabled)" +} + +output "qdrant_http_endpoint" { + value = "http://${google_compute_instance.vector_db_vm.network_interface[0].access_config[0].nat_ip}:6333" + description = "The HTTP endpoint for Qdrant vector database" +} + +output "qdrant_grpc_endpoint" { + value = "http://${google_compute_instance.vector_db_vm.network_interface[0].access_config[0].nat_ip}:6334" + description = "The gRPC endpoint for Qdrant vector database" } \ No newline at end of file diff --git a/deployment/terraform/scripts/install_qdrant.sh b/deployment/terraform/scripts/install_qdrant.sh new file mode 100644 index 0000000..9fa132c --- /dev/null +++ b/deployment/terraform/scripts/install_qdrant.sh @@ -0,0 +1,189 @@ +#!/bin/bash + +# Qdrant Vector Database Installation Script +# This script installs and configures Qdrant on Ubuntu 22.04 + +set -e + +# Update system packages +apt-get update +apt-get upgrade -y + +# Install required packages +apt-get install -y curl wget gnupg2 software-properties-common apt-transport-https ca-certificates + +# Install Docker +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg 
--dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg +echo "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null +apt-get update +apt-get install -y docker-ce docker-ce-cli containerd.io + +# Start and enable Docker +systemctl start docker +systemctl enable docker + +# Create qdrant user and directories +useradd -r -s /bin/false qdrant || true +mkdir -p /opt/qdrant/storage +mkdir -p /opt/qdrant/config +chown -R qdrant:qdrant /opt/qdrant + +# Create Qdrant configuration file +cat > /opt/qdrant/config/config.yaml << EOF +service: + host: 0.0.0.0 + http_port: 6333 + grpc_port: 6334 + enable_cors: true + +storage: + storage_path: /qdrant/storage + snapshots_path: /qdrant/snapshots + on_disk_payload: true + +cluster: + enabled: false + +telemetry: + disabled: true + +log_level: INFO +EOF + +# Create API key configuration if provided +if [ -n "${qdrant_api_key}" ] && [ "${qdrant_api_key}" != "" ]; then +cat >> /opt/qdrant/config/config.yaml << EOF + +service: + api_key: "${qdrant_api_key}" +EOF +fi + +# Create systemd service for Qdrant +cat > /etc/systemd/system/qdrant.service << EOF +[Unit] +Description=Qdrant Vector Database +After=docker.service +Requires=docker.service + +[Service] +Type=simple +User=root +ExecStartPre=-/usr/bin/docker stop qdrant +ExecStartPre=-/usr/bin/docker rm qdrant +ExecStart=/usr/bin/docker run --name qdrant \ + -p 6333:6333 \ + -p 6334:6334 \ + -v /opt/qdrant/storage:/qdrant/storage:z \ + -v /opt/qdrant/config/config.yaml:/qdrant/config/production.yaml:z \ + qdrant/qdrant:latest +ExecStop=/usr/bin/docker stop qdrant +Restart=always +RestartSec=10 + +[Install] +WantedBy=multi-user.target +EOF + +# Pull Qdrant Docker image +docker pull qdrant/qdrant:latest + +# Enable and start Qdrant service +systemctl daemon-reload +systemctl enable qdrant +systemctl start qdrant + +# Install monitoring tools 
+apt-get install -y htop iotop nethogs + +# Create a simple health check script +cat > /opt/qdrant/health_check.sh << 'EOF' +#!/bin/bash +response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:6333/health) +if [ "$response" = "200" ]; then + echo "Qdrant is healthy" + exit 0 +else + echo "Qdrant is not responding properly (HTTP $response)" + exit 1 +fi +EOF + +chmod +x /opt/qdrant/health_check.sh + +# Set up log rotation for Docker logs +cat > /etc/logrotate.d/docker << EOF +/var/lib/docker/containers/*/*.log { + rotate 7 + daily + compress + size=1M + missingok + delaycompress + copytruncate +} +EOF + +# Configure firewall (ufw) +ufw --force enable +ufw allow ssh +ufw allow 6333/tcp # Qdrant HTTP API +ufw allow 6334/tcp # Qdrant gRPC API + +# Create a simple backup script +cat > /opt/qdrant/backup.sh << 'EOF' +#!/bin/bash +BACKUP_DIR="/opt/qdrant/backups" +DATE=$(date +%Y%m%d_%H%M%S) +mkdir -p $BACKUP_DIR + +# Create snapshot via API +curl -X POST "http://localhost:6333/snapshots" \ + -H "Content-Type: application/json" \ + -d '{"snapshot_name": "backup_'$DATE'"}' + +# Copy storage directory +tar -czf $BACKUP_DIR/qdrant_storage_$DATE.tar.gz -C /opt/qdrant storage/ + +# Delete backup archives older than 7 days +find $BACKUP_DIR -name "*.tar.gz" -mtime +7 -delete + +echo "Backup completed: $DATE" +EOF + +chmod +x /opt/qdrant/backup.sh + +# Set up daily backup cron job +echo "0 2 * * * root /opt/qdrant/backup.sh >> /var/log/qdrant_backup.log 2>&1" >> /etc/crontab + +# Wait for Qdrant to be ready +echo "Waiting for Qdrant to start..." +for i in {1..30}; do + if curl -s http://localhost:6333/health > /dev/null; then + echo "Qdrant is ready!" + break + fi + echo "Waiting... 
($i/30)" + sleep 10 +done + +# Create a default collection for image vectors +curl -X PUT "http://localhost:6333/collections/image_vectors" \ + -H "Content-Type: application/json" \ + -d '{ + "vectors": { + "size": 512, + "distance": "Cosine" + }, + "optimizers_config": { + "default_segment_number": 2 + }, + "replication_factor": 1 + }' + +echo "Qdrant installation and configuration completed!" +echo "Qdrant is accessible at:" +echo " HTTP API: http://$(curl -s ifconfig.me):6333" +echo " gRPC API: http://$(curl -s ifconfig.me):6334" +echo "Health check: /opt/qdrant/health_check.sh" +echo "Backup script: /opt/qdrant/backup.sh" \ No newline at end of file diff --git a/deployment/terraform/terraform.tfvars.example b/deployment/terraform/terraform.tfvars.example index dfbd2d1..2916e6d 100644 --- a/deployment/terraform/terraform.tfvars.example +++ b/deployment/terraform/terraform.tfvars.example @@ -2,4 +2,9 @@ project_id = "your-gcp-project-id" region = "us-central1" zone = "us-central1-a" storage_bucket_name = "your-app-storage-bucket" -firestore_db_name = "imagedb" \ No newline at end of file +firestore_db_name = "imagedb" + +# Vector Database Configuration +qdrant_api_key = "your-secure-api-key-here" # Optional: leave empty for no authentication +allowed_cidr_blocks = "YOUR_IP_ADDRESS/32" # Replace with your IP or network range +use_static_ip = false # Set to true if you want a static IP \ No newline at end of file diff --git a/deployment/terraform/variables.tf b/deployment/terraform/variables.tf index ea8d231..ecf1913 100644 --- a/deployment/terraform/variables.tf +++ b/deployment/terraform/variables.tf @@ -48,4 +48,24 @@ variable "cloud_function_service_account" { description = "The service account email for Cloud Functions" type = string default = "" +} + +# Vector Database VM variables +variable "qdrant_api_key" { + description = "API key for Qdrant vector database" + type = string + sensitive = true + default = "" +} + +variable "allowed_cidr_blocks" { + 
description = "CIDR blocks allowed to access the vector database" + type = string + default = "0.0.0.0/0" # Change this to your specific IP ranges for security +} + +variable "use_static_ip" { + description = "Whether to use a static IP for the vector database VM" + type = bool + default = false } \ No newline at end of file diff --git a/deployment/terraform/vm.tf b/deployment/terraform/vm.tf new file mode 100644 index 0000000..1d9a31a --- /dev/null +++ b/deployment/terraform/vm.tf @@ -0,0 +1,104 @@ +# VM instance for vector database +resource "google_compute_instance" "vector_db_vm" { + name = "sereact-vector-db" + machine_type = "e2-standard-2" # 2 vCPUs, 8GB RAM + zone = var.zone + + boot_disk { + initialize_params { + image = "ubuntu-os-cloud/ubuntu-2204-lts" + size = 50 # 50GB disk + type = "pd-standard" + } + } + + network_interface { + network = "default" + access_config { + # Ephemeral public IP + } + } + + # Startup script to install and configure Qdrant + metadata_startup_script = templatefile("${path.module}/scripts/install_qdrant.sh", { + qdrant_api_key = var.qdrant_api_key + }) + + # Service account for the VM + service_account { + email = google_service_account.vector_db_sa.email + scopes = ["cloud-platform"] + } + + # Tags for firewall rules + tags = ["vector-db", "qdrant"] + + depends_on = [google_project_service.services] +} + +# Service account for the vector DB VM +resource "google_service_account" "vector_db_sa" { + account_id = "vector-db-sa" + display_name = "Vector Database Service Account" + description = "Service account for the vector database VM" +} + +# Firewall rule to allow Qdrant access +resource "google_compute_firewall" "qdrant_firewall" { + name = "allow-qdrant" + network = "default" + + allow { + protocol = "tcp" + ports = ["6333", "6334"] # Qdrant HTTP and gRPC ports + } + + source_ranges = [ + "10.0.0.0/8", # Internal GCP networks + var.allowed_cidr_blocks # Your specified IP ranges + ] + + target_tags = ["qdrant"] +} + +# 
Static IP for the vector DB VM (optional but recommended) +resource "google_compute_address" "vector_db_static_ip" { + name = "vector-db-static-ip" + region = var.region +} + +# Separate VM instance bound to the static IP (created only when use_static_ip is true) +resource "google_compute_instance" "vector_db_vm_with_static_ip" { + count = var.use_static_ip ? 1 : 0 + name = "sereact-vector-db-static" + machine_type = "e2-standard-2" + zone = var.zone + + boot_disk { + initialize_params { + image = "ubuntu-os-cloud/ubuntu-2204-lts" + size = 50 + type = "pd-standard" + } + } + + network_interface { + network = "default" + access_config { + nat_ip = google_compute_address.vector_db_static_ip.address + } + } + + metadata_startup_script = templatefile("${path.module}/scripts/install_qdrant.sh", { + qdrant_api_key = var.qdrant_api_key + }) + + service_account { + email = google_service_account.vector_db_sa.email + scopes = ["cloud-platform"] + } + + tags = ["vector-db", "qdrant"] + + depends_on = [google_project_service.services] +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4decad2..15372a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,8 +13,8 @@ passlib==1.7.4 tenacity==8.2.3 pytest==7.4.3 httpx==0.25.1 -pinecone-client==2.2.4 pillow==10.1.0 python-slugify==8.0.1 email-validator==2.1.0.post1 pymongo==4.5.0 +qdrant-client==1.7.0 diff --git a/src/services/vector_db.py b/src/services/vector_db.py new file mode 100644 index 0000000..b0ce3b5 --- /dev/null +++ b/src/services/vector_db.py @@ -0,0 +1,318 @@ +""" +Vector Database Service for handling image vectors using Qdrant. 
+""" + +import os +import logging +from typing import List, Dict, Any, Optional, Tuple +import numpy as np +from qdrant_client import QdrantClient +from qdrant_client.http import models +from qdrant_client.http.models import Distance, VectorParams, PointStruct +import uuid + +logger = logging.getLogger(__name__) + + +class VectorDatabaseService: + """Service for managing image vectors in Qdrant vector database.""" + + def __init__( + self, + host: str = None, + port: int = 6333, + api_key: str = None, + collection_name: str = "image_vectors" + ): + """ + Initialize the vector database service. + + Args: + host: Qdrant server host + port: Qdrant server port + api_key: API key for authentication (optional) + collection_name: Name of the collection to use + """ + self.host = host or os.getenv("QDRANT_HOST", "localhost") + self.port = port + self.api_key = api_key or os.getenv("QDRANT_API_KEY") + self.collection_name = collection_name + + # Initialize Qdrant client + self.client = QdrantClient( + host=self.host, + port=self.port, + api_key=self.api_key + ) + + # Ensure collection exists + self._ensure_collection_exists() + + def _ensure_collection_exists(self): + """Ensure the collection exists, create if it doesn't.""" + try: + collections = self.client.get_collections() + collection_names = [col.name for col in collections.collections] + + if self.collection_name not in collection_names: + logger.info(f"Creating collection: {self.collection_name}") + self.client.create_collection( + collection_name=self.collection_name, + vectors_config=VectorParams( + size=512, # Typical size for image embeddings + distance=Distance.COSINE + ) + ) + logger.info(f"Collection {self.collection_name} created successfully") + else: + logger.info(f"Collection {self.collection_name} already exists") + + except Exception as e: + logger.error(f"Error ensuring collection exists: {e}") + raise + + def add_image_vector( + self, + image_id: str, + vector: List[float], + metadata: Dict[str, Any] 
= None + ) -> str: + """ + Add an image vector to the database. + + Args: + image_id: Unique identifier for the image + vector: Image embedding vector + metadata: Additional metadata for the image + + Returns: + Point ID in the vector database + """ + try: + point_id = str(uuid.uuid4()) + payload = { + "image_id": image_id, + "timestamp": metadata.get("timestamp") if metadata else None, + "filename": metadata.get("filename") if metadata else None, + "size": metadata.get("size") if metadata else None, + "format": metadata.get("format") if metadata else None, + **(metadata or {}) + } + + point = PointStruct( + id=point_id, + vector=vector, + payload=payload + ) + + self.client.upsert( + collection_name=self.collection_name, + points=[point] + ) + + logger.info(f"Added vector for image {image_id} with point ID {point_id}") + return point_id + + except Exception as e: + logger.error(f"Error adding image vector: {e}") + raise + + def search_similar_images( + self, + query_vector: List[float], + limit: int = 10, + score_threshold: float = 0.7, + filter_conditions: Dict[str, Any] = None + ) -> List[Dict[str, Any]]: + """ + Search for similar images based on vector similarity. 
+ + Args: + query_vector: Query vector to search for + limit: Maximum number of results to return + score_threshold: Minimum similarity score threshold + filter_conditions: Additional filter conditions + + Returns: + List of similar images with scores and metadata + """ + try: + search_filter = None + if filter_conditions: + search_filter = models.Filter( + must=[ + models.FieldCondition( + key=key, + match=models.MatchValue(value=value) + ) + for key, value in filter_conditions.items() + ] + ) + + search_result = self.client.search( + collection_name=self.collection_name, + query_vector=query_vector, + query_filter=search_filter, + limit=limit, + score_threshold=score_threshold + ) + + results = [] + for hit in search_result: + result = { + "point_id": hit.id, + "score": hit.score, + "image_id": hit.payload.get("image_id"), + "metadata": hit.payload + } + results.append(result) + + logger.info(f"Found {len(results)} similar images") + return results + + except Exception as e: + logger.error(f"Error searching similar images: {e}") + raise + + def get_image_vector(self, image_id: str) -> Optional[Dict[str, Any]]: + """ + Get vector and metadata for a specific image. 
+ + Args: + image_id: Image identifier + + Returns: + Vector data and metadata if found, None otherwise + """ + try: + search_result = self.client.scroll( + collection_name=self.collection_name, + scroll_filter=models.Filter( + must=[ + models.FieldCondition( + key="image_id", + match=models.MatchValue(value=image_id) + ) + ] + ), + limit=1, + with_vectors=True + ) + + if search_result[0]: # search_result is a tuple (points, next_page_offset) + point = search_result[0][0] + return { + "point_id": point.id, + "vector": point.vector, + "image_id": point.payload.get("image_id"), + "metadata": point.payload + } + + return None + + except Exception as e: + logger.error(f"Error getting image vector: {e}") + raise + + def delete_image_vector(self, image_id: str) -> bool: + """ + Delete vector for a specific image. + + Args: + image_id: Image identifier + + Returns: + True if deleted successfully, False otherwise + """ + try: + # First find the point ID + search_result = self.client.scroll( + collection_name=self.collection_name, + scroll_filter=models.Filter( + must=[ + models.FieldCondition( + key="image_id", + match=models.MatchValue(value=image_id) + ) + ] + ), + limit=1 + ) + + if search_result[0]: + point_id = search_result[0][0].id + self.client.delete( + collection_name=self.collection_name, + points_selector=models.PointIdsList( + points=[point_id] + ) + ) + logger.info(f"Deleted vector for image {image_id}") + return True + + logger.warning(f"No vector found for image {image_id}") + return False + + except Exception as e: + logger.error(f"Error deleting image vector: {e}") + raise + + def get_collection_info(self) -> Dict[str, Any]: + """ + Get information about the collection. 
+ + Returns: + Collection information including count and configuration + """ + try: + collection_info = self.client.get_collection(self.collection_name) + return { + "name": collection_info.config.params.vectors.size, + "vectors_count": collection_info.points_count, + "vector_size": collection_info.config.params.vectors.size, + "distance": collection_info.config.params.vectors.distance, + "status": collection_info.status + } + + except Exception as e: + logger.error(f"Error getting collection info: {e}") + raise + + def health_check(self) -> bool: + """ + Check if the vector database is healthy. + + Returns: + True if healthy, False otherwise + """ + try: + collections = self.client.get_collections() + return True + except Exception as e: + logger.error(f"Vector database health check failed: {e}") + return False + + +# Utility functions for vector operations +def normalize_vector(vector: List[float]) -> List[float]: + """Normalize a vector to unit length.""" + vector_array = np.array(vector) + norm = np.linalg.norm(vector_array) + if norm == 0: + return vector + return (vector_array / norm).tolist() + + +def cosine_similarity(vector1: List[float], vector2: List[float]) -> float: + """Calculate cosine similarity between two vectors.""" + v1 = np.array(vector1) + v2 = np.array(vector2) + + dot_product = np.dot(v1, v2) + norm_v1 = np.linalg.norm(v1) + norm_v2 = np.linalg.norm(v2) + + if norm_v1 == 0 or norm_v2 == 0: + return 0.0 + + return dot_product / (norm_v1 * norm_v2) \ No newline at end of file