May 14, 2025

Fashion Catalog Similarity Search using Datastax AstraDB Vector Database

DataStax Astra DB's vector database capabilities can be leveraged to build an efficient fashion catalog similarity search, enabling users to find visually similar clothing items based on uploaded catalog descriptions. 


Product descriptions and other relevant text associated with the fashion items (e.g., tags, categories) can be converted into vector embeddings using a text embedding model (like SentenceTransformers).


from cassandra.cluster import Cluster

from cassandra.auth import PlainTextAuthProvider


# Read environment variables

secure_connect_bundle_path = os.getenv('ASTRA_DB_SECURE_CONNECT_BUNDLE')

application_token = os.getenv('ASTRA_DB_TOKEN')


# Setup authentication provider

auth_provider = PlainTextAuthProvider('token', application_token)


# Connect to the Cassandra database using the secure connect bundle

cluster = Cluster(

    cloud={"secure_connect_bundle": secure_connect_bundle_path},

    auth_provider=auth_provider

)

session = cluster.connect()


# Define keyspace

keyspace = "catalog"

v_dimension = 5


# Set the keyspace

session.set_keyspace(keyspace)


# Verify connection by querying the system.local table

rows = session.execute("SELECT release_version FROM system.local")

for row in rows:

    print(f"Connected to Cassandra, release version: {row.release_version}")


# Print the current keyspace

current_keyspace = session.execute("SELECT keyspace_name FROM system_schema.keyspaces WHERE keyspace_name = %s", [keyspace])

for row in current_keyspace:

    print(f"Connected to keyspace: {row.keyspace_name}")


print("Connected to AstraDB and keyspace successfully!")


session.execute((

    "CREATE TABLE IF NOT EXISTS {keyspace}.ProductDescVectors (ProductId INT PRIMARY KEY, ProductDesc TEXT, ImageURL text, CatalogVector VECTOR<FLOAT,{v_dimension}>);"

).format(keyspace=keyspace, v_dimension=v_dimension))


session.execute((

    "CREATE CUSTOM INDEX IF NOT EXISTS idx_ProductDescVectors "

    "ON {keyspace}.ProductDescVectors "

    "(CatalogVector) USING 'StorageAttachedIndex' WITH OPTIONS = "

    "{{'similarity_function' : 'cosine'}};"

).format(keyspace=keyspace))


text_blocks = [

    (1, "United colors of Benetton Men White Boxer Trunks","UndercolorsofBenetton-Men-White-Boxer_b4ef04538840c0020e4829ecc042ead1_images.jpg", [-0.0711570307612419, 0.0490173473954201, -0.0348679609596729, -0.0208837632089853, 0.0250527486205101]

),

    (2, "Turtle Men Check Red Shirt","Turtle-Men-Check-Red-Shirt_4982b2b1a76a85a85c9adc8b4b2d523a_images.jpg" ,[-0.0678209140896797, 0.0918413251638412, 0.0087888557463884, -0.0005505480221473, 0.0586152337491512]),

    (3, "United Colors of Benetton Men White Check Shirt","United-Colors-of-Benetton-Men-White-Check-Shirt_13cfaff26872c298112a8e7da15c1e1d_images.jpg" ,[-0.0697127357125282, 0.0486216545104980, -0.0169006455689669, -0.0160229168832302, 0.0137890130281448]

),

 (4, "United Colors of Benetton Men Check White Shirts","UnitedColorsofBenetton-Men-Check-White-Shirts_5bd8cae4fc61052a6f00cfcd69c4a936_images.jpg" ,[-0.0499644242227077, 0.0566278323531151, -0.0294290613383055, -0.0070271748118103, 0.0289674568921328]

),

    (5, "Wrangler-Men-Broad-Blue-Shirt","Wrangler-Men-Broad-Blue-Shirt_8211520250143786-1.jpg" ,[-0.0581886917352676, 0.0378338471055031, 0.0425588376820087, -0.0423909239470959, 0.0186673272401094]

)

]

for block in text_blocks:

    id, text, text,vector = block

    session.execute(

        f"INSERT INTO {keyspace}.ProductDescVectors(ProductId, ProductDesc, ImageURL,CatalogVector) VALUES (%s, %s,%s, %s)",

        (id, text, text,vector)

    )


/*

Performing Catalog Similarity Search:

User Enters Text Description: If a user provides a text description, generate its vector embedding using the appropriate text embedding model like llama_index.embeddings.huggingface

Vector Search in Astra DB: Use Astra DB's vector search functionality to find the most similar embeddings in your database to the user's query embedding.

*/

ann_query = (

    f"SELECT  ProductDesc, ImageURL,similarity_cosine(CatalogVector, [0.15, 0.1, 0.1, 0.35, 0.55]) as similarity FROM {keyspace}.ProductDescVectors "

    "ORDER BY ProductImageVector ANN OF [0.15, 0.1, 0.1, 0.35, 0.55] LIMIT 2"

)

for row in session.execute(ann_query):

    print(f"[{row.productdesc}\" (sim: {row.similarity:.4f})")


    # Print success message

    

print("Catalogs with semantic match.")

/*


Example scenario:

A user wants more fashion products similar to "United Colors of Benetton Men Check White Shirts". 

The application:

Generates a vector embedding of the "United Colors of Benetton Men Check White Shirts" text.

Performs a vector search in Astra DB to find fashion items with similar catalog(text) embeddings.

Retrieves and displays the details of the most similar catalog, including similar brand,styles, colors, and potentially complementary items like shoes or accessories. 


Retrieve Matching Items: Astra DB returns the IDs of the fashion items corresponding to the most similar embeddings.


Display Results: Retrieve the full details (images, descriptions, etc.) of the matching fashion items from Astra DB and display them to the user. 

*/

ann_query_matching = (

    f"SELECT  ProductDesc, ImageURL,similarity_cosine(CatalogVector, [-0.0499644242227077, 0.0566278323531151, -0.0294290613383055, -0.0070271748118103, 0.0289674568921328]) as similarity FROM {keyspace}.ProductDescVectors "

    "ORDER BY CatalogVector ANN OF [-0.0499644242227077, 0.0566278323531151, -0.0294290613383055, -0.0070271748118103, 0.0289674568921328] LIMIT 2"

)

for row in session.execute(ann_query_matching):

    print(f"[{row.productdesc}\" (sim: {row.similarity:.4f})")

    

print("Data with similar match.")


Benefits of using Astra DB for fashion similarity search:

Scalability and Performance: Astra DB is designed for high-throughput, low-latency operations, making it suitable for large fashion catalogs and real-time search.

Simplified Architecture: Astra DB can store both your product data and vector embeddings, eliminating the need for separate vector databases.

Cost-Effectiveness: Astra DB offers serverless options, reducing the need for infrastructure management and associated costs.

Integration with GenAI Applications: Astra DB works seamlessly with tools like LangChain for building generative AI applications, potentially enabling features like fashion recommendations, chatbots, and more. 


May 10, 2025

SQL Server Online Rebuild Index Bug

 

What do you do if you have a very large table and you can’t take an outage window to rebuild the clustered index offline?

·         Exclude the clustered index from your index maintenance jobs

·         Configure it to only have REORGANIZE operations, not REBUILDS

·         Rebuild Index Online

As Microsoft’s documentation says:

We recommend performing online index operations for business environments that operate 24 hours a day, seven days a week, in which the need for concurrent user activity during index operations is vital.

 

How online index operations work

To allow for concurrent user activity during an index data definition language (DDL) operation, the following structures are used during the online index operation: source and preexisting indexes, target, and for rebuilding a heap or dropping a clustered index online, a temporary mapping index.

·         During an online index operation, such as creating a clustered index on a nonindexed table (heap), the source and target go through three phases: preparation, build, and final.

·         The source is the original table or clustered index data.

·         An online index rebuild starts against an index

·         The preexisting indexes are available to concurrent users for select, insert, update, and delete operations.

·         The target or target is the new index (or heap) or a set of new indexes that is being created or rebuilt. Users insert, update, and delete operations to the source are applied by the Database Engine to the target during the index operation. The target index isn't used until the index operation is committed. Internally, the index is marked as write-only.

·         A modification query runs before the index rebuilds query. At the end of the index rebuild, it tries to get a schema modification lock. It can’t, because the modification query is still running

·         The index rebuild’s super-high-level lock request then causes a massive pile-up behind it– blocking even queries that want to do dirty reads, even NOLOCK can be blocked

The following illustration shows the process for creating an initial clustered index online. The source object (the heap) has no other indexes. The source and target structure activities are shown for each phase; concurrent user SELECTINSERTUPDATE, and DELETE operations are also shown. The preparation, build, and final phases are indicated together with the lock modes used in each phase.

 

                                                      

Diagram showing the activities performed during online index operation.

 

Side-Effects of  online Rebuild  index

·         Blocking -. In order to complete Online Rebuild Index operation, SQL server  need a very high level of lock: a schema modification lock (SCH-M) which results in blocking scenarios.

·         Offline Rebuild Index -SQL Server Online Index Rebuild sometimes happens offline without warning and it blocks both read and write queries against the table for long periods.

 

Workaround to avoid blocking

ALTER INDEX [INDEX_NAME]

ON [TABLENAME]

REBUILD WITH ( FILLFACTOR = 80,ONLINE = ON

 (WAIT_AT_LOW_PRIORITY (MAX_DURATION = 5 MINUTES, ABORT_AFTER_WAIT = SELF)));

  • WAIT_AT_LOW_PRIORITY – is the new setting
    • MAX_DURATION – is used to specify the maximum time in minutes to wait until an action
    • ABORT_AFTER_WAIT – is used to abort the operation if it exceeds the wait time
      • NONE – is to wait for the lock with regular priority
      • SELF – exits the online index operation
      • BLOCKERS – kills transactions blocking the index rebuild

There’s no pileup! “LOW_PRIORITY” really does mean low priority – the “NOLOCK” query is able to get a shared schema lock and move forward.

 

SQL Code:

CREATE TABLE [dbo].[IndexMaintenanceLog](

                [IndexName] [varchar](200) NULL,

                [TableName] [varchar](200) NULL,

                [CommandText] [varchar](1500) NULL,

                [Status] [varchar](200) NULL,

                [ErrorMessage] [varchar](1500) NULL,

                [CreatedON] [datetime] NULL

) ON [PRIMARY]

GO


SET ANSI_NULLS ON

GO

SET QUOTED_IDENTIFIER ON

GO

CREATE PROCEDURE [dbo].[ IndexOptimize]

     @UseResumable BIT = 0,

    @UseLowPriorityWait BIT = 1

 

AS

BEGIN

    SET NOCOUNT ON;

               

    DECLARE @sql NVARCHAR(MAX), @IndexName SYSNAME, @TableName SYSNAME, @SchemaName SYSNAME, @Frag FLOAT, @IndexTypeDesc NVARCHAR(60);

 

    SELECT

        s.name AS SchemaName,

        t.name AS TableName,

        i.name AS IndexName,

        ips.avg_fragmentation_in_percent AS Fragmentation,

        i.type_desc AS IndexTypeDesc

    INTO #IndexStats

    FROM sys.dm_db_index_physical_stats(DB_ID(), NULL, NULL, NULL, 'LIMITED') AS ips

    INNER JOIN sys.indexes i ON ips.object_id = i.object_id AND ips.index_id = i.index_id

    INNER JOIN sys.tables t ON i.object_id = t.object_id

    INNER JOIN sys.schemas s ON t.schema_id = s.schema_id

   

    AND i.name IS NOT NULL

    AND ips.page_count > 100 -- Skip small indexes

    AND ips.index_level = 0 -- Only leaf level

    ORDER BY ips.avg_fragmentation_in_percent DESC;

 

    DECLARE index_cursor CURSOR FOR

    SELECT SchemaName, TableName, IndexName, Fragmentation, IndexTypeDesc FROM #IndexStats;

 

    OPEN index_cursor;

 

    FETCH NEXT FROM index_cursor INTO @SchemaName, @TableName, @IndexName, @Frag, @IndexTypeDesc;

 

    WHILE @@FETCH_STATUS = 0

    BEGIN

        BEGIN TRY

            IF @Frag >= 30

            BEGIN

                SET @sql = N'ALTER INDEX [' + @IndexName + '] ON [' + @SchemaName + '].[' + @TableName + '] REBUILD WITH ( FILLFACTOR = 80,ONLINE = ON ('

 

                IF @UseLowPriorityWait = 1

                    SET @sql += 'WAIT_AT_LOW_PRIORITY (MAX_DURATION = 5 MINUTES, ABORT_AFTER_WAIT = SELF)';

 

                IF @UseResumable = 1

                    SET @sql += ', RESUMABLE = ON, MAX_DURATION = 15 MINUTES';

 

                SET @sql += '));';

            END

            ELSE IF @Frag >= 10

            BEGIN

                SET @sql = N'ALTER INDEX [' + @IndexName + '] ON [' + @SchemaName + '].[' + @TableName + '] REORGANIZE;';

            END

            ELSE

            BEGIN

                FETCH NEXT FROM index_cursor INTO @SchemaName, @TableName, @IndexName, @Frag, @IndexTypeDesc;

                CONTINUE;

            END

 

                                                PRINT @sql

            EXEC sp_executesql @sql;

 

            INSERT INTO IndexMaintenanceLog (IndexName, TableName, CommandText, Status,CreatedOn)

            VALUES (@IndexName, @TableName, @sql, 'Success',getdate());

        END TRY

        BEGIN CATCH

            INSERT INTO IndexMaintenanceLog (IndexName, TableName, CommandText, Status, ErrorMessage,CreatedOn)

            VALUES (@IndexName, @TableName, @sql, 'Failure', ERROR_MESSAGE(),getdate());

        END CATCH;

 

        FETCH NEXT FROM index_cursor INTO @SchemaName, @TableName, @IndexName, @Frag, @IndexTypeDesc;

    END

 

    CLOSE index_cursor;

    DEALLOCATE index_cursor;

 

    SET NOCOUNT OFF;

END

 

https://learn.microsoft.com/en-us/sql/relational-databases/indexes/how-online-index-operations-work?view=sql-server-ver16

 

https://www.brentozar.com/archive/2015/01/testing-alter-index-rebuild-wait_at_low_priority-sql-server-2014/

 

May 01, 2025

Image Similarity Search using Vector embeddings and Cosine similarity

 Image embeddings capture visual features (shapes, colors, objects, textures)

Image embedding models are typically CNN or vision transformer based

Image Embedding Generation:

  • Instead of embedding text, we embed images using a pre-trained vision model
  • Popular models include Vision Transformers (ViT), ResNet, EfficientNet, or CLIP
  • The model extracts features from images and converts them to dense vector representations

Core Similarities with Text Search:

  • Both methods convert unstructured data (text/images) into vector representations
  • Both use similarity metrics (typically cosine similarity) to find closest matches
  • Both can be stored in vector databases like AstraDB for efficient retrieval
Applications:

  • Product recommendations (visually similar products)
  • Reverse image search
  • Finding duplicate or near-duplicate images
  • Content moderation (finding similar inappropriate content)
  • Medical image analysis (finding similar cases)

Python code -

import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
import torch
from transformers import AutoImageProcessor, AutoModel

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize image embedding model - using ViT (Vision Transformer)
model_name = "google/vit-base-patch16-224"
processor = AutoImageProcessor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

# Function to get image embedding
def get_image_embedding(image_path):
    image = Image.open(image_path)
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use CLS token as the image embedding
    embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy().flatten()
    return embedding

# List of image paths
image_paths = [
    "Indigo Nation Men Plain Orange Shirts.jpg",
    "ADIDAS Men Navy Blue Shirts.jpg",
    "Indigo Nation Men Price catch Blue Shirts .jpg",
    "Puma Men's Foundation Grey Polo T-shirt.jpg",
    "Indigo Nation Men  Bling Pink Shirts.jpg"
]

# Query image
query_image = "Indigo Nation Men Plain Orange Shirts.jpg"

# Get embedding for the query image
query_embedding = get_image_embedding(query_image)

# Store all embeddings and their corresponding images
all_embeddings = []
for image_path in image_paths:
    embedding = get_image_embedding(image_path)
    all_embeddings.append(embedding)

# Convert to numpy arrays for similarity calculation
query_embedding_np = np.array(query_embedding).reshape(1, -1)
all_embeddings_np = np.array(all_embeddings)

# Calculate cosine similarity between query and all images
similarities = cosine_similarity(query_embedding_np, all_embeddings_np).flatten()

# Create a DataFrame to display results
results = pd.DataFrame({
    'Image': image_paths,
    'Similarity Score': similarities
})

# Sort by similarity score in descending order
results = results.sort_values('Similarity Score', ascending=False)

print(f"Query image: {query_image}")
print("\nSimilarity Search Results:")
print(results)

# Find the most similar image
most_similar_idx = np.argmax(similarities)
print(f"\nMost similar image: \"{image_paths[most_similar_idx]}\" with similarity score: {similarities[most_similar_idx]:.4f}")


Fashion Catalog Similarity Search using Datastax AstraDB Vector Database

DataStax Astra DB's vector database capabilities can be leveraged to build an efficient fashion catalog similarity search, enabling user...