# vector_search.py
import os
import pickle
from typing import List, Tuple, Dict

import faiss
from sentence_transformers import SentenceTransformer
# Configure paths on Archil disk
INDICES_PATH = "/mnt/archil/faiss/indices"
EMBEDDINGS_PATH = "/mnt/archil/faiss/embeddings"
DATA_PATH = "/mnt/archil/faiss/data"
MODELS_PATH = "/mnt/archil/faiss/models"
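
# Safety net (an assumption, not part of the original setup): create the
# working directories on first run. This assumes the Archil volume is
# already mounted at /mnt/archil.
for _path in (INDICES_PATH, EMBEDDINGS_PATH, DATA_PATH, MODELS_PATH):
    os.makedirs(_path, exist_ok=True)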

class ArchilVectorSearch:
    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        # Initialize embedding model (weights are cached on the Archil disk)
        self.model = SentenceTransformer(
            model_name,
            cache_folder=MODELS_PATH
        )
        self.dimension = self.model.get_sentence_embedding_dimension()
        # FAISS index and document store start empty
        self.index = None
        self.documents = []
        self.metadata = []

    def create_index(self, index_type="flat"):
        """Create a new FAISS index."""
        if index_type == "flat":
            # Exact search (L2 distance)
            self.index = faiss.IndexFlatL2(self.dimension)
        elif index_type == "ivf":
            # Approximate search with an IVF (inverted file) index using 100
            # partitions; IVF indexes must be trained before vectors are added,
            # and search scope is controlled by index.nprobe (default 1)
            quantizer = faiss.IndexFlatL2(self.dimension)
            self.index = faiss.IndexIVFFlat(quantizer, self.dimension, 100)
        elif index_type == "hnsw":
            # Hierarchical Navigable Small World graph, 32 links per node
            self.index = faiss.IndexHNSWFlat(self.dimension, 32)
        else:
            raise ValueError(f"Unknown index type: {index_type}")
        print(f"Created {index_type} index with dimension {self.dimension}")

    def add_documents(self, documents: List[str], metadata: List[Dict] = None):
        """Add documents to the vector index."""
        print(f"Encoding {len(documents)} documents...")
        # Generate embeddings (FAISS expects float32)
        embeddings = self.model.encode(documents, show_progress_bar=True)
        embeddings = embeddings.astype('float32')
        # Train the index first if it requires training (e.g. IVF)
        if not self.index.is_trained:
            print("Training index...")
            self.index.train(embeddings)
        self.index.add(embeddings)
        # Store documents and metadata alongside the vectors
        start_id = len(self.documents)
        self.documents.extend(documents)
        if metadata:
            self.metadata.extend(metadata)
        else:
            # Default ids continue across batches instead of restarting at 0
            self.metadata.extend([{"id": start_id + i} for i in range(len(documents))])
        print(f"Added {len(documents)} documents. Total: {self.index.ntotal}")

    def search(self, query: str, k: int = 5) -> List[Tuple[str, float, Dict]]:
        """Search for the k nearest documents (smaller L2 distance = closer)."""
        if self.index is None or self.index.ntotal == 0:
            return []
        # Encode the query with the same model used for the documents
        query_embedding = self.model.encode([query]).astype('float32')
        # Search
        distances, indices = self.index.search(query_embedding, k)
        # Format results; FAISS pads with -1 when fewer than k matches exist
        results = []
        for distance, idx in zip(distances[0], indices[0]):
            if 0 <= idx < len(self.documents):
                results.append((
                    self.documents[idx],
                    float(distance),
                    self.metadata[idx]
                ))
        return results

    def save_index(self, index_name: str):
        """Save the index, documents, and metadata to the Archil disk."""
        index_path = os.path.join(INDICES_PATH, f"{index_name}.index")
        metadata_path = os.path.join(INDICES_PATH, f"{index_name}_metadata.pkl")
        documents_path = os.path.join(INDICES_PATH, f"{index_name}_documents.pkl")
        # Save FAISS index
        faiss.write_index(self.index, index_path)
        # Save metadata and documents
        with open(metadata_path, 'wb') as f:
            pickle.dump(self.metadata, f)
        with open(documents_path, 'wb') as f:
            pickle.dump(self.documents, f)
        print(f"Index saved to {index_path}")

    def load_index(self, index_name: str):
        """Load an index and its metadata from the Archil disk."""
        index_path = os.path.join(INDICES_PATH, f"{index_name}.index")
        metadata_path = os.path.join(INDICES_PATH, f"{index_name}_metadata.pkl")
        documents_path = os.path.join(INDICES_PATH, f"{index_name}_documents.pkl")
        if not os.path.exists(index_path):
            raise FileNotFoundError(f"Index not found: {index_path}")
        # Load FAISS index
        self.index = faiss.read_index(index_path)
        # Load metadata and documents
        with open(metadata_path, 'rb') as f:
            self.metadata = pickle.load(f)
        with open(documents_path, 'rb') as f:
            self.documents = pickle.load(f)
        print(f"Loaded index with {self.index.ntotal} vectors")


def create_sample_dataset():
    """Create a small sample dataset for demonstration."""
    documents = [
        "Machine learning is a subset of artificial intelligence",
        "Deep learning uses neural networks with multiple layers",
        "Natural language processing helps computers understand text",
        "Computer vision enables machines to interpret visual information",
        "Reinforcement learning trains agents through rewards and penalties",
        "Supervised learning uses labeled data for training",
        "Unsupervised learning finds patterns in unlabeled data",
        "Transfer learning adapts pre-trained models to new tasks",
        "Feature engineering improves model performance",
        "Cross-validation helps evaluate model generalization"
    ]
    metadata = [
        {"category": "AI", "topic": "ML Basics", "id": i}
        for i in range(len(documents))
    ]
    return documents, metadata


if __name__ == "__main__":
    # Initialize vector search system
    vs = ArchilVectorSearch()

    # Create sample dataset
    documents, metadata = create_sample_dataset()

    # Create and populate index
    vs.create_index("flat")
    vs.add_documents(documents, metadata)

    # Save to Archil disk
    vs.save_index("ml_concepts")

    # Demonstrate search
    queries = [
        "neural networks and deep learning",
        "training with labeled examples",
        "computer understanding of images"
    ]
    for query in queries:
        print(f"\nQuery: {query}")
        results = vs.search(query, k=3)
        for i, (doc, distance, meta) in enumerate(results, 1):
            print(f"{i}. Distance: {distance:.4f}")
            print(f"   Document: {doc}")
            print(f"   Metadata: {meta}")