Data Engineering with Avishkar: 2025

May 14, 2025

Fashion Catalog Similarity Search using Datastax AstraDB Vector Database

DataStax Astra DB's vector database capabilities can be leveraged to build an efficient fashion catalog similarity search, enabling users to find visually similar clothing items based on uploaded catalog descriptions.

Product descriptions and other relevant text associated with the fashion items (e.g., tags, categories) can be converted into vector embeddings using a text embedding model (like SentenceTransformers).

from cassandra.cluster import Cluster

from cassandra.auth import PlainTextAuthProvider

# Read environment variables

secure_connect_bundle_path = os.getenv('ASTRA_DB_SECURE_CONNECT_BUNDLE')

application_token = os.getenv('ASTRA_DB_TOKEN')

# Setup authentication provider

auth_provider = PlainTextAuthProvider('token', application_token)

# Connect to the Cassandra database using the secure connect bundle

cluster = Cluster(

cloud={"secure_connect_bundle": secure_connect_bundle_path},

auth_provider=auth_provider

)

session = cluster.connect()

# Define keyspace

keyspace = "catalog"

v_dimension = 5

# Set the keyspace

session.set_keyspace(keyspace)

# Verify connection by querying the system.local table

rows = session.execute("SELECT release_version FROM system.local")

for row in rows:

print(f"Connected to Cassandra, release version: {row.release_version}")

# Print the current keyspace

current_keyspace = session.execute("SELECT keyspace_name FROM system_schema.keyspaces WHERE keyspace_name = %s", [keyspace])

for row in current_keyspace:

print(f"Connected to keyspace: {row.keyspace_name}")

print("Connected to AstraDB and keyspace successfully!")

session.execute((

"CREATE TABLE IF NOT EXISTS {keyspace}.ProductDescVectors (ProductId INT PRIMARY KEY, ProductDesc TEXT, ImageURL text, CatalogVector VECTOR<FLOAT,{v_dimension}>);"

).format(keyspace=keyspace, v_dimension=v_dimension))

session.execute((

"CREATE CUSTOM INDEX IF NOT EXISTS idx_ProductDescVectors "

"ON {keyspace}.ProductDescVectors "

"(CatalogVector) USING 'StorageAttachedIndex' WITH OPTIONS = "

"{{'similarity_function' : 'cosine'}};"

).format(keyspace=keyspace))

text_blocks = [

(1, "United colors of Benetton Men White Boxer Trunks","UndercolorsofBenetton-Men-White-Boxer_b4ef04538840c0020e4829ecc042ead1_images.jpg", [-0.0711570307612419, 0.0490173473954201, -0.0348679609596729, -0.0208837632089853, 0.0250527486205101]

(2, "Turtle Men Check Red Shirt","Turtle-Men-Check-Red-Shirt_4982b2b1a76a85a85c9adc8b4b2d523a_images.jpg" ,[-0.0678209140896797, 0.0918413251638412, 0.0087888557463884, -0.0005505480221473, 0.0586152337491512]),

(3, "United Colors of Benetton Men White Check Shirt","United-Colors-of-Benetton-Men-White-Check-Shirt_13cfaff26872c298112a8e7da15c1e1d_images.jpg" ,[-0.0697127357125282, 0.0486216545104980, -0.0169006455689669, -0.0160229168832302, 0.0137890130281448]

(4, "United Colors of Benetton Men Check White Shirts","UnitedColorsofBenetton-Men-Check-White-Shirts_5bd8cae4fc61052a6f00cfcd69c4a936_images.jpg" ,[-0.0499644242227077, 0.0566278323531151, -0.0294290613383055, -0.0070271748118103, 0.0289674568921328]

(5, "Wrangler-Men-Broad-Blue-Shirt","Wrangler-Men-Broad-Blue-Shirt_8211520250143786-1.jpg" ,[-0.0581886917352676, 0.0378338471055031, 0.0425588376820087, -0.0423909239470959, 0.0186673272401094]

)

]

for block in text_blocks:

id, text, text,vector = block

session.execute(

f"INSERT INTO {keyspace}.ProductDescVectors(ProductId, ProductDesc, ImageURL,CatalogVector) VALUES (%s, %s,%s, %s)",

(id, text, text,vector)

)

Performing Catalog Similarity Search:

User Enters Text Description: If a user provides a text description, generate its vector embedding using the appropriate text embedding model like llama_index.embeddings.huggingface

Vector Search in Astra DB: Use Astra DB's vector search functionality to find the most similar embeddings in your database to the user's query embedding.

ann_query = (

f"SELECT ProductDesc, ImageURL,similarity_cosine(CatalogVector, [0.15, 0.1, 0.1, 0.35, 0.55]) as similarity FROM {keyspace}.ProductDescVectors "

"ORDER BY ProductImageVector ANN OF [0.15, 0.1, 0.1, 0.35, 0.55] LIMIT 2"

)

for row in session.execute(ann_query):

print(f"[{row.productdesc}\" (sim: {row.similarity:.4f})")

# Print success message

print("Catalogs with semantic match.")

Example scenario:

A user wants more fashion products similar to "United Colors of Benetton Men Check White Shirts".

The application:

Generates a vector embedding of the "United Colors of Benetton Men Check White Shirts" text.

Performs a vector search in Astra DB to find fashion items with similar catalog(text) embeddings.

Retrieves and displays the details of the most similar catalog, including similar brand,styles, colors, and potentially complementary items like shoes or accessories.

Retrieve Matching Items: Astra DB returns the IDs of the fashion items corresponding to the most similar embeddings.

Display Results: Retrieve the full details (images, descriptions, etc.) of the matching fashion items from Astra DB and display them to the user.

ann_query_matching = (

f"SELECT ProductDesc, ImageURL,similarity_cosine(CatalogVector, [-0.0499644242227077, 0.0566278323531151, -0.0294290613383055, -0.0070271748118103, 0.0289674568921328]) as similarity FROM {keyspace}.ProductDescVectors "

"ORDER BY CatalogVector ANN OF [-0.0499644242227077, 0.0566278323531151, -0.0294290613383055, -0.0070271748118103, 0.0289674568921328] LIMIT 2"

)

for row in session.execute(ann_query_matching):

print(f"[{row.productdesc}\" (sim: {row.similarity:.4f})")

print("Data with similar match.")

Benefits of using Astra DB for fashion similarity search:

Scalability and Performance: Astra DB is designed for high-throughput, low-latency operations, making it suitable for large fashion catalogs and real-time search.

Simplified Architecture: Astra DB can store both your product data and vector embeddings, eliminating the need for separate vector databases.

Cost-Effectiveness: Astra DB offers serverless options, reducing the need for infrastructure management and associated costs.

Integration with GenAI Applications: Astra DB works seamlessly with tools like LangChain for building generative AI applications, potentially enabling features like fashion recommendations, chatbots, and more.

May 10, 2025

SQL Server Online Rebuild Index Bug

What do you do if you have a very large table and you can’t take an outage window to rebuild the clustered index offline?

· Exclude the clustered index from your index maintenance jobs

· Configure it to only have REORGANIZE operations, not REBUILDS

· Rebuild Index Online

As Microsoft’s documentation says:

We recommend performing online index operations for business environments that operate 24 hours a day, seven days a week, in which the need for concurrent user activity during index operations is vital.

How online index operations work

To allow for concurrent user activity during an index data definition language (DDL) operation, the following structures are used during the online index operation: source and preexisting indexes, target, and for rebuilding a heap or dropping a clustered index online, a temporary mapping index.

· During an online index operation, such as creating a clustered index on a nonindexed table (heap), the source and target go through three phases: preparation, build, and final.

· The source is the original table or clustered index data.

· An online index rebuild starts against an index

· The preexisting indexes are available to concurrent users for select, insert, update, and delete operations.

· The target or target is the new index (or heap) or a set of new indexes that is being created or rebuilt. Users insert, update, and delete operations to the source are applied by the Database Engine to the target during the index operation. The target index isn't used until the index operation is committed. Internally, the index is marked as write-only.

· A modification query runs before the index rebuilds query. At the end of the index rebuild, it tries to get a schema modification lock. It can’t, because the modification query is still running

· The index rebuild’s super-high-level lock request then causes a massive pile-up behind it– blocking even queries that want to do dirty reads, even NOLOCK can be blocked

The following illustration shows the process for creating an initial clustered index online. The source object (the heap) has no other indexes. The source and target structure activities are shown for each phase; concurrent user SELECT, INSERT, UPDATE, and DELETE operations are also shown. The preparation, build, and final phases are indicated together with the lock modes used in each phase.

Diagram showing the activities performed during online index operation.

Side-Effects of online Rebuild index

· Blocking -. In order to complete Online Rebuild Index operation, SQL server need a very high level of lock: a schema modification lock (SCH-M) which results in blocking scenarios.

· Offline Rebuild Index -SQL Server Online Index Rebuild sometimes happens offline without warning and it blocks both read and write queries against the table for long periods.

Workaround to avoid blocking

ALTER INDEX [INDEX_NAME]

ON [TABLENAME]

REBUILD WITH ( FILLFACTOR = 80,ONLINE = ON

(WAIT_AT_LOW_PRIORITY (MAX_DURATION = 5 MINUTES, ABORT_AFTER_WAIT = SELF)));

WAIT_AT_LOW_PRIORITY – is the new setting

MAX_DURATION – is used to specify the maximum time in minutes to wait until an action

ABORT_AFTER_WAIT – is used to abort the operation if it exceeds the wait time

NONE – is to wait for the lock with regular priority
SELF – exits the online index operation
BLOCKERS – kills transactions blocking the index rebuild

There’s no pileup! “LOW_PRIORITY” really does mean low priority – the “NOLOCK” query is able to get a shared schema lock and move forward.

SQL Code:

CREATE TABLE [dbo].[IndexMaintenanceLog](

[IndexName] [varchar](200) NULL,

[TableName] [varchar](200) NULL,

[CommandText] [varchar](1500) NULL,

[Status] [varchar](200) NULL,

[ErrorMessage] [varchar](1500) NULL,

[CreatedON] [datetime] NULL

) ON [PRIMARY]

SET ANSI_NULLS ON

SET QUOTED_IDENTIFIER ON

CREATE PROCEDURE [dbo].[ IndexOptimize]

@UseResumable BIT = 0,

@UseLowPriorityWait BIT = 1

BEGIN

SET NOCOUNT ON;

DECLARE @sql NVARCHAR(MAX), @IndexName SYSNAME, @TableName SYSNAME, @SchemaName SYSNAME, @Frag FLOAT, @IndexTypeDesc NVARCHAR(60);

SELECT

s.name AS SchemaName,

t.name AS TableName,

i.name AS IndexName,

ips.avg_fragmentation_in_percent AS Fragmentation,

i.type_desc AS IndexTypeDesc

INTO #IndexStats

FROM sys.dm_db_index_physical_stats(DB_ID(), NULL, NULL, NULL, 'LIMITED') AS ips

INNER JOIN sys.indexes i ON ips.object_id = i.object_id AND ips.index_id = i.index_id

INNER JOIN sys.tables t ON i.object_id = t.object_id

INNER JOIN sys.schemas s ON t.schema_id = s.schema_id

AND i.name IS NOT NULL

AND ips.page_count > 100 -- Skip small indexes

AND ips.index_level = 0 -- Only leaf level

ORDER BY ips.avg_fragmentation_in_percent DESC;

DECLARE index_cursor CURSOR FOR

SELECT SchemaName, TableName, IndexName, Fragmentation, IndexTypeDesc FROM #IndexStats;

OPEN index_cursor;

FETCH NEXT FROM index_cursor INTO @SchemaName, @TableName, @IndexName, @Frag, @IndexTypeDesc;

WHILE @@FETCH_STATUS = 0

BEGIN

BEGIN TRY

IF @Frag >= 30

BEGIN

SET @sql = N'ALTER INDEX [' + @IndexName + '] ON [' + @SchemaName + '].[' + @TableName + '] REBUILD WITH ( FILLFACTOR = 80,ONLINE = ON ('

IF @UseLowPriorityWait = 1

SET @sql += 'WAIT_AT_LOW_PRIORITY (MAX_DURATION = 5 MINUTES, ABORT_AFTER_WAIT = SELF)';

IF @UseResumable = 1

SET @sql += ', RESUMABLE = ON, MAX_DURATION = 15 MINUTES';

SET @sql += '));';

END

ELSE IF @Frag >= 10

BEGIN

SET @sql = N'ALTER INDEX [' + @IndexName + '] ON [' + @SchemaName + '].[' + @TableName + '] REORGANIZE;';

END

ELSE

BEGIN

FETCH NEXT FROM index_cursor INTO @SchemaName, @TableName, @IndexName, @Frag, @IndexTypeDesc;

CONTINUE;

END

PRINT @sql

EXEC sp_executesql @sql;

INSERT INTO IndexMaintenanceLog (IndexName, TableName, CommandText, Status,CreatedOn)

VALUES (@IndexName, @TableName, @sql, 'Success',getdate());

END TRY

BEGIN CATCH

INSERT INTO IndexMaintenanceLog (IndexName, TableName, CommandText, Status, ErrorMessage,CreatedOn)

VALUES (@IndexName, @TableName, @sql, 'Failure', ERROR_MESSAGE(),getdate());

END CATCH;

FETCH NEXT FROM index_cursor INTO @SchemaName, @TableName, @IndexName, @Frag, @IndexTypeDesc;

END

CLOSE index_cursor;

DEALLOCATE index_cursor;

SET NOCOUNT OFF;

END

https://learn.microsoft.com/en-us/sql/relational-databases/indexes/how-online-index-operations-work?view=sql-server-ver16

https://www.brentozar.com/archive/2015/01/testing-alter-index-rebuild-wait_at_low_priority-sql-server-2014/

May 01, 2025

Image Similarity Search using Vector embeddings and Cosine similarity

Image embeddings capture visual features (shapes, colors, objects, textures)

Image embedding models are typically CNN or vision transformer based

Image Embedding Generation:

Instead of embedding text, we embed images using a pre-trained vision model
Popular models include Vision Transformers (ViT), ResNet, EfficientNet, or CLIP
The model extracts features from images and converts them to dense vector representations

Core Similarities with Text Search:

Both methods convert unstructured data (text/images) into vector representations
Both use similarity metrics (typically cosine similarity) to find closest matches
Both can be stored in vector databases like AstraDB for efficient retrieval

Applications:

Product recommendations (visually similar products)
Reverse image search
Finding duplicate or near-duplicate images
Content moderation (finding similar inappropriate content)
Medical image analysis (finding similar cases)

Python code -

import os

import pandas as pd

import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

from PIL import Image

import torch

from transformers import AutoImageProcessor, AutoModel

# Set device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize image embedding model - using ViT (Vision Transformer)

model_name = "google/vit-base-patch16-224"

processor = AutoImageProcessor.from_pretrained(model_name)

model = AutoModel.from_pretrained(model_name).to(device)

# Function to get image embedding

def get_image_embedding(image_path):

image = Image.open(image_path)

inputs = processor(images=image, return_tensors="pt").to(device)

with torch.no_grad():

outputs = model(**inputs)

# Use CLS token as the image embedding

embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy().flatten()

return embedding

# List of image paths

image_paths = [

"Indigo Nation Men Plain Orange Shirts.jpg",

"ADIDAS Men Navy Blue Shirts.jpg",

"Indigo Nation Men Price catch Blue Shirts .jpg",

"Puma Men's Foundation Grey Polo T-shirt.jpg",

"Indigo Nation Men Bling Pink Shirts.jpg"

]

# Query image

query_image = "Indigo Nation Men Plain Orange Shirts.jpg"

# Get embedding for the query image

query_embedding = get_image_embedding(query_image)

# Store all embeddings and their corresponding images

all_embeddings = []

for image_path in image_paths:

embedding = get_image_embedding(image_path)

all_embeddings.append(embedding)

# Convert to numpy arrays for similarity calculation

query_embedding_np = np.array(query_embedding).reshape(1, -1)

all_embeddings_np = np.array(all_embeddings)

# Calculate cosine similarity between query and all images

similarities = cosine_similarity(query_embedding_np, all_embeddings_np).flatten()

# Create a DataFrame to display results

results = pd.DataFrame({

'Image': image_paths,

'Similarity Score': similarities

})

# Sort by similarity score in descending order

results = results.sort_values('Similarity Score', ascending=False)

print(f"Query image: {query_image}")

print("\nSimilarity Search Results:")

print(results)

# Find the most similar image

most_similar_idx = np.argmax(similarities)

print(f"\nMost similar image: \"{image_paths[most_similar_idx]}\" with similarity score: {similarities[most_similar_idx]:.4f}")