Contents

# Prerequisite (run once in a notebook cell before anything else): !pip install sentence-transformers transformers torch scikit-learn pandas
# Sentence-BERT (SBERT)
from sentence_transformers import SentenceTransformer
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

# Load the lightweight 'all-MiniLM-L6-v2' SBERT checkpoint.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Reference description of the industry; every website text is scored against it.
industry_text = "industry data warehouse and data for businesses"

# Candidate texts, one entry per website (append further entries here).
website_texts = [
    "We are the best business data seller in the country",
    "We can give you the best warehouse for you in the country",
    "Look no further for your warehouse needs",
]

# Embed the single reference text and the list of website texts with the same model.
industry_embedding_sbert = sbert_model.encode(industry_text)
website_embeddings_sbert = sbert_model.encode(website_texts)

# The reference embedding is passed as a single row, so the result holds one
# row of scores (one score per website); [0] extracts that row.
similarities_sbert = cosine_similarity(
    industry_embedding_sbert.reshape(1, -1),
    website_embeddings_sbert,
)[0]
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[2], line 2
      1 # Sentence-BERT (SBERT)
----> 2 from sentence_transformers import SentenceTransformer
      3 import numpy as np
      5 # Initialize SBERT model

ModuleNotFoundError: No module named 'sentence_transformers'
# LaBSE (Language-agnostic BERT Sentence Embedding)
from sentence_transformers import SentenceTransformer

# Load the LaBSE checkpoint as a second embedding model.
labse_model = SentenceTransformer('sentence-transformers/LaBSE')

# Embed the same reference text and website texts, this time with LaBSE.
industry_embedding_labse = labse_model.encode(industry_text)
website_embeddings_labse = labse_model.encode(website_texts)

# The reference embedding is passed as a single row, so the result holds one
# row of scores (one score per website); [0] extracts that row.
similarities_labse = cosine_similarity(
    industry_embedding_labse.reshape(1, -1),
    website_embeddings_labse,
)[0]
# TF-IDF with Cosine Similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the reference text together with the website texts;
# the reference text is placed first, so it ends up as row 0 of the matrix.
all_texts = [industry_text, *website_texts]

# TF-IDF weighting with the built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(all_texts)

# Split the matrix back into the reference row and the website rows.
industry_vector_tfidf, website_vectors_tfidf = tfidf_matrix[0], tfidf_matrix[1:]

# One row of scores (reference vs. each website); [0] extracts that row.
similarities_tfidf = cosine_similarity(
    industry_vector_tfidf,
    website_vectors_tfidf,
)[0]
# Comparing the Methods
import pandas as pd

# Results table: one row per website, one column per similarity method.
# Websites are labelled "Website 1", "Website 2", ... in input order.
df_results = pd.DataFrame({
    'Website': [
        f"Website {position}"
        for position, _ in enumerate(website_texts, start=1)
    ],
    'SBERT Similarity': similarities_sbert,
    'LaBSE Similarity': similarities_labse,
    'TF-IDF Similarity': similarities_tfidf,
})

# Show the full table of scores.
print(df_results)
# For each scoring column, select the row of df_results with the highest score.
best_sbert, best_labse, best_tfidf = (
    df_results.loc[df_results[score_column].idxmax()]
    for score_column in ('SBERT Similarity', 'LaBSE Similarity', 'TF-IDF Similarity')
)

# Report the winning website and its score for every method, one per line.
print("\nBest Matches per Method:")
print(
    f"SBERT: {best_sbert['Website']} with similarity {best_sbert['SBERT Similarity']:.4f}",
    f"LaBSE: {best_labse['Website']} with similarity {best_labse['LaBSE Similarity']:.4f}",
    f"TF-IDF: {best_tfidf['Website']} with similarity {best_tfidf['TF-IDF Similarity']:.4f}",
    sep="\n",
)
# For every method column, find the row label of its top score, then translate
# that row label into the corresponding website name.
best_matches = (
    df_results[['SBERT Similarity', 'LaBSE Similarity', 'TF-IDF Similarity']]
    .idxmax()
    .map(lambda row_label: df_results.at[row_label, 'Website'])
)

# Tally how many methods picked each website as their best match.
consensus = best_matches.value_counts()

print("\nConsensus on Best Match:")
print(consensus)