#!pip install sentence-transformers scikit-learn pandas
#!pip install --upgrade ipywidgets
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize the model
# Note: all-mpnet-base-v2 is trained mainly on English, so its scores on
# Vietnamese text are unreliable; the multilingual models tried below are
# better suited to this task.
model = SentenceTransformer('all-mpnet-base-v2')

# Your industry description ("data trading industry")
industry_text = "Ngành nghề buôn bán dữ liệu"

# Texts from different websites (English glosses in comments)
website_texts = [
    "Chúng tôi buôn bán dữ liệu và công nghệ",       # "We trade data and technology"
    "Bên cung cấp công nghệ",                        # "Technology provider"
    "Hiện tại chúng tôi đang phát triển hệ thống",   # "We are currently developing a system"
    # Add more website texts
]

# Generate embeddings
industry_embedding = model.encode(industry_text)
website_embeddings = model.encode(website_texts)

# Compute similarities
similarities = cosine_similarity([industry_embedding], website_embeddings)[0]

# Find the best match
best_match_index = np.argmax(similarities)
best_match_score = similarities[best_match_index]

print(f"Best matching website is at index {best_match_index} with a similarity score of {best_match_score:.4f}")
import pandas as pd

# Initialize Multilingual SBERT model
sbert_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
#!pip install underthesea
from underthesea import word_tokenize

def preprocess_text(text):
    # Tokenize Vietnamese text
    return ' '.join(word_tokenize(text))
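# Quick sanity check of the segmenter: underthesea groups multi-syllable
# words into single tokens, and format="text" joins them with underscores,
# the convention PhoBERT-style models expect. Outputs shown are indicative.
print(word_tokenize("Ngành nghề buôn bán dữ liệu"))
# e.g. ['Ngành nghề', 'buôn bán', 'dữ liệu']
print(word_tokenize("Ngành nghề buôn bán dữ liệu", format="text"))
# e.g. 'Ngành_nghề buôn_bán dữ_liệu'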


# Apply preprocessing
industry_text = preprocess_text(industry_text)
website_texts = [preprocess_text(text) for text in website_texts]
# Generate embeddings
industry_embedding_sbert = sbert_model.encode(industry_text)
website_embeddings_sbert = sbert_model.encode(website_texts)
# Compute cosine similarities
similarities_sbert = cosine_similarity([industry_embedding_sbert], website_embeddings_sbert)[0]
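# Equivalent check with the library's own cosine helper; util.cos_sim
# accepts numpy arrays and returns a 1 x N tensor of similarities.
from sentence_transformers import util
print(util.cos_sim(industry_embedding_sbert, website_embeddings_sbert))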
# LaBSE (Language-agnostic BERT Sentence Embedding)
from sentence_transformers import SentenceTransformer

# Initialize LaBSE model
labse_model = SentenceTransformer('sentence-transformers/LaBSE')
# Generate embeddings
industry_embedding_labse = labse_model.encode(industry_text)
website_embeddings_labse = labse_model.encode(website_texts)
# Compute cosine similarities
similarities_labse = cosine_similarity([industry_embedding_labse], website_embeddings_labse)[0]
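# With L2-normalized embeddings (encode(..., normalize_embeddings=True)),
# cosine similarity reduces to a plain dot product, which is cheaper when
# comparing against many websites; values match similarities_labse above.
ind_norm = labse_model.encode(industry_text, normalize_embeddings=True)
web_norm = labse_model.encode(website_texts, normalize_embeddings=True)
similarities_labse_dot = web_norm @ ind_norm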
#!pip install scikit-learn pandas underthesea
# TF-IDF with Cosine Similarity (Customized for Vietnamese)
# The texts were already word-segmented with underthesea above, so no
# further preprocessing is needed here.


from sklearn.feature_extraction.text import TfidfVectorizer

# Combine all texts for fitting the vectorizer
all_texts = [industry_text] + website_texts

# Initialize TF-IDF Vectorizer.
# Note: scikit-learn has no built-in Vietnamese stop-word list, and
# stop_words='english' would do nothing useful on Vietnamese text;
# pass a custom list via stop_words=[...] if you need one.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_texts)

# Industry vector is the first vector
industry_vector_tfidf = tfidf_matrix[0]

# Website vectors
website_vectors_tfidf = tfidf_matrix[1:]

# Compute cosine similarities
similarities_tfidf = cosine_similarity(industry_vector_tfidf, website_vectors_tfidf)[0]
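# Sanity-check the learned vocabulary: the segmented Vietnamese tokens
# should appear as features (get_feature_names_out needs scikit-learn >= 1.0).
print(vectorizer.get_feature_names_out())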
#!pip install transformers torch sentencepiece
# PhoBERT (Vietnamese-Specific BERT Model)
from transformers import AutoTokenizer, AutoModel
import torch

# Initialize PhoBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
phobert_model = AutoModel.from_pretrained("vinai/phobert-base")

# The texts were already word-segmented above. Note that PhoBERT was trained
# on input with compound words joined by underscores (underthesea's
# word_tokenize(text, format="text")); the space-joined segmentation used
# here is a simplification.

def get_phobert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = phobert_model(**inputs)
        # Use the first-token (<s>) representation, PhoBERT's CLS equivalent
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    return cls_embedding
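# PhoBERT was not trained to make its first token a sentence embedding, so
# mean pooling over the token embeddings (masking out padding) is a common
# and often stronger alternative for similarity; a sketch under that assumption:
def get_phobert_mean_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = phobert_model(**inputs)
    mask = inputs["attention_mask"].unsqueeze(-1)           # (1, seq_len, 1)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)  # (1, hidden)
    counts = mask.sum(dim=1).clamp(min=1)
    return (summed / counts).squeeze().numpy()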

# Generate embeddings
industry_embedding_phobert = get_phobert_embedding(industry_text)
website_embeddings_phobert = [get_phobert_embedding(text) for text in website_texts]

# Compute cosine similarities
similarities_phobert = cosine_similarity([industry_embedding_phobert], website_embeddings_phobert)[0]
# Comparing the Methods for Vietnamese

# Create a DataFrame to store similarity scores
df_results = pd.DataFrame({
    'Website': [f"Website {i+1}" for i in range(len(website_texts))],
    'SBERT Similarity': similarities_sbert,
    'LaBSE Similarity': similarities_labse,
    'TF-IDF Similarity': similarities_tfidf,
    'PhoBERT Similarity': similarities_phobert
})

# Display the DataFrame
print(df_results)
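# The four methods' scores live on different scales (PhoBERT CLS cosines in
# particular tend to run high), so averaging per-method ranks is a fairer
# aggregate than averaging the raw scores.
score_cols = ['SBERT Similarity', 'LaBSE Similarity',
              'TF-IDF Similarity', 'PhoBERT Similarity']
df_results['Mean Rank'] = df_results[score_cols].rank(ascending=False).mean(axis=1)
print(df_results.sort_values('Mean Rank'))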
# Find the best match for each method
best_sbert = df_results.loc[df_results['SBERT Similarity'].idxmax()]
best_labse = df_results.loc[df_results['LaBSE Similarity'].idxmax()]
best_tfidf = df_results.loc[df_results['TF-IDF Similarity'].idxmax()]
best_phobert = df_results.loc[df_results['PhoBERT Similarity'].idxmax()]

print("\nBest Matches per Method:")
print(f"SBERT: {best_sbert['Website']} with similarity {best_sbert['SBERT Similarity']:.4f}")
print(f"LaBSE: {best_labse['Website']} with similarity {best_labse['LaBSE Similarity']:.4f}")
print(f"TF-IDF: {best_tfidf['Website']} with similarity {best_tfidf['TF-IDF Similarity']:.4f}")
print(f"PhoBERT: {best_phobert['Website']} with similarity {best_phobert['PhoBERT Similarity']:.4f}")

# Find the best match index for each method
best_matches = {
    'SBERT': df_results['SBERT Similarity'].idxmax(),
    'LaBSE': df_results['LaBSE Similarity'].idxmax(),
    'TF-IDF': df_results['TF-IDF Similarity'].idxmax(),
    'PhoBERT': df_results['PhoBERT Similarity'].idxmax()
}

# Map indices to website names
best_matches = {method: df_results.loc[idx, 'Website'] for method, idx in best_matches.items()}

# Count occurrences
consensus = pd.Series(list(best_matches.values())).value_counts()

print("\nConsensus on Best Match:")
print(consensus)
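# If the methods disagree, the value_counts() above shows the split; a simple
# majority vote picks the consensus winner when one exists.
consensus_winner = consensus.idxmax()
print(f"\nMajority pick: {consensus_winner} ({consensus.max()} of {len(best_matches)} methods)")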