This code shows how to find semantically similar text using vector embeddings and cosine similarity.
HuggingFaceEmbedding from llama_index is imported to generate text embeddings.
BAAI/bge-small-en-v1.5 is a lightweight embedding model, used here to convert each text in the weather_descs list into a vector embedding.
Cosine similarity helps identify texts with similar meaning by measuring how "aligned" their vector representations are.
Python Code -
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Semantic similarity search: embed a query and a handful of candidate texts,
# score each candidate against the query with cosine similarity, then print
# the ranked table and the single best match.

# Set TensorFlow environment variable to suppress warnings.
# NOTE(review): this assignment runs after the imports above; if any of them
# has already loaded TensorFlow, the flag may be read too late to have an
# effect -- confirm, and move it above the imports if so.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

# Initialize embedding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# List of texts to search over
weather_descs = [
    "The weather cold today",
    "Today is Monday",
    "Today is first Sunday of winter",
    "Today is Sunday",
]

# Query text
query = "It is freezing today"

# Get embedding for the query
query_embedding = embed_model.get_text_embedding(query)

# Embed every candidate text; the resulting list is in the same order as
# weather_descs, so index i of the scores below refers to weather_descs[i].
all_embeddings = [
    embed_model.get_text_embedding(weather_desc)
    for weather_desc in weather_descs
]

# Convert to numpy arrays for similarity calculation. cosine_similarity works
# on 2-D inputs, so the single query vector is reshaped into one row.
query_embedding_np = np.array(query_embedding).reshape(1, -1)
all_embeddings_np = np.array(all_embeddings)

# Calculate cosine similarity between the query and all texts. The result has
# one row (the query) and one column per text; flatten it to a 1-D score array.
similarities = cosine_similarity(query_embedding_np, all_embeddings_np).flatten()

# Create a DataFrame to display results
results = pd.DataFrame({
    'Text': weather_descs,
    'Similarity Score': similarities,
})

# Sort by similarity score in descending order (best match first)
results = results.sort_values('Similarity Score', ascending=False)

print("Query:", query)
print("\nSimilarity Search Results:")
print(results)

# Find the most similar text. argmax is taken on the unsorted score array, so
# the index lines up with weather_descs rather than with the sorted DataFrame.
most_similar_idx = np.argmax(similarities)
print(f"\nMost similar text: \"{weather_descs[most_similar_idx]}\" with similarity score: {similarities[most_similar_idx]:.4f}")
Output :
No comments:
Post a Comment