#!pip install sentence-transformers scikit-learn pandas
#!pip install --upgrade ipywidgets
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# Initialize the model. Note: all-mpnet-base-v2 is an English-only model,
# so its scores on Vietnamese text are unreliable; the multilingual models
# below address this.
model = SentenceTransformer('all-mpnet-base-v2')
# Your industry description (unaccented Vietnamese: "data trading industry")
industry_text = "Nganh nghe buon ban du lieu"
# Texts from different websites
website_texts = [
    "Chung toi buon ban du lieu va cong nghe",      # "We trade data and technology"
    "Ben cung cap cong nghe",                       # "We supply technology"
    "Hien tai chung toi dang phat trien he thong",  # "We are currently developing a system"
    # Add more website texts
]
# Generate embeddings
industry_embedding = model.encode(industry_text)
website_embeddings = model.encode(website_texts)
# Compute similarities
similarities = cosine_similarity([industry_embedding], website_embeddings)[0]
# Find the best match
best_match_index = np.argmax(similarities)
best_match_score = similarities[best_match_index]
print(f"Best matching website is at index {best_match_index} with a similarity score of {best_match_score:.4f}")
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
# Initialize Multilingual SBERT model
sbert_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
#!pip install underthesea
from underthesea import word_tokenize
def preprocess_text(text):
    # Tokenize Vietnamese text
    return ' '.join(word_tokenize(text))
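# Quick sanity check of the tokenizer on one of the sample sentences
# (the exact segmentation depends on the installed underthesea version):
print(preprocess_text("Chung toi buon ban du lieu"))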
# Your industry description
industry_text = "Nganh nghe buon ban du lieu"
# Texts from different websites
website_texts = [
    "Chung toi buon ban du lieu va cong nghe",
    "Ben cung cap cong nghe",
    "Hien tai chung toi dang phat trien he thong",
    # Add more website texts if needed
]
# Apply preprocessing
industry_text = preprocess_text(industry_text)
website_texts = [preprocess_text(text) for text in website_texts]
# Generate embeddings
industry_embedding_sbert = sbert_model.encode(industry_text)
website_embeddings_sbert = sbert_model.encode(website_texts)
# Compute cosine similarities
similarities_sbert = cosine_similarity([industry_embedding_sbert], website_embeddings_sbert)[0]
# LaBSE (Language-agnostic BERT Sentence Embedding)
from sentence_transformers import SentenceTransformer
# Initialize LaBSE model
labse_model = SentenceTransformer('sentence-transformers/LaBSE')
# Generate embeddings
industry_embedding_labse = labse_model.encode(industry_text)
website_embeddings_labse = labse_model.encode(website_texts)
# Compute cosine similarities
similarities_labse = cosine_similarity([industry_embedding_labse], website_embeddings_labse)[0]
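# Equivalent check: encode() accepts normalize_embeddings=True in recent
# sentence-transformers releases, after which a plain dot product equals
# cosine similarity:
ind_norm = labse_model.encode(industry_text, normalize_embeddings=True)
web_norm = labse_model.encode(website_texts, normalize_embeddings=True)
print(web_norm @ ind_norm)  # should closely match similarities_labse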
#!pip install scikit-learn pandas underthesea
# TF-IDF with Cosine Similarity (Customized for Vietnamese)
from underthesea import word_tokenize
def preprocess_text(text):
    # Tokenize Vietnamese text
    return ' '.join(word_tokenize(text))
# Apply preprocessing (note: if the cells above were run in order, these texts
# are already tokenized; re-applying word_tokenize is redundant but harmless)
industry_text = preprocess_text(industry_text)
website_texts = [preprocess_text(text) for text in website_texts]
from sklearn.feature_extraction.text import TfidfVectorizer
# Combine all texts for fitting the vectorizer
all_texts = [industry_text] + website_texts
# Initialize the TF-IDF vectorizer. The built-in stop_words='english' list
# does not apply to Vietnamese; supply a custom Vietnamese stop-word list if needed.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_texts)
# Industry vector is the first vector
industry_vector_tfidf = tfidf_matrix[0]
# Website vectors
website_vectors_tfidf = tfidf_matrix[1:]
# Compute cosine similarities
similarities_tfidf = cosine_similarity(industry_vector_tfidf, website_vectors_tfidf)[0]
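# Optional: inspect the learned vocabulary to confirm that the Vietnamese
# word tokens survived vectorization (get_feature_names_out requires
# scikit-learn >= 1.0):
print(vectorizer.get_feature_names_out())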
#!pip install transformers torch sentencepiece scikit-learn pandas underthesea
# PhoBERT (Vietnamese-Specific BERT Model)
from transformers import AutoTokenizer, AutoModel
import torch
# Initialize PhoBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
phobert_model = AutoModel.from_pretrained("vinai/phobert-base")
from underthesea import word_tokenize
def preprocess_text(text):
    # PhoBERT expects word-segmented input; format="text" joins the syllables
    # of multi-word tokens with underscores, matching PhoBERT's training data
    return word_tokenize(text, format="text")
# Apply preprocessing (as above, texts carried over from earlier cells are
# already segmented; re-segmenting is harmless)
industry_text = preprocess_text(industry_text)
website_texts = [preprocess_text(text) for text in website_texts]
def get_phobert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = phobert_model(**inputs)
    # Use the first token (<s>, PhoBERT's equivalent of [CLS]) as the sentence vector
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    return cls_embedding
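# First-token pooling on a model without a sentence-level training objective
# can be weak; masked mean pooling over all tokens is a common alternative.
# A sketch under that assumption, reusing the tokenizer and model above:
def get_phobert_embedding_mean(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = phobert_model(**inputs)
    mask = inputs["attention_mask"].unsqueeze(-1)           # (1, seq_len, 1)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)  # masked token sum
    return (summed / mask.sum(dim=1)).squeeze().numpy()     # mean -> 1-D vector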
# Generate embeddings
industry_embedding_phobert = get_phobert_embedding(industry_text)
website_embeddings_phobert = [get_phobert_embedding(text) for text in website_texts]
# Compute cosine similarities
similarities_phobert = cosine_similarity([industry_embedding_phobert], website_embeddings_phobert)[0]
# Comparing the Methods for Vietnamese
# Create a DataFrame to store similarity scores
df_results = pd.DataFrame({
    'Website': [f"Website {i+1}" for i in range(len(website_texts))],
    'SBERT Similarity': similarities_sbert,
    'LaBSE Similarity': similarities_labse,
    'TF-IDF Similarity': similarities_tfidf,
    'PhoBERT Similarity': similarities_phobert
})
# Display the DataFrame
print(df_results)
# Find the best match for each method
best_sbert = df_results.loc[df_results['SBERT Similarity'].idxmax()]
best_labse = df_results.loc[df_results['LaBSE Similarity'].idxmax()]
best_tfidf = df_results.loc[df_results['TF-IDF Similarity'].idxmax()]
best_phobert = df_results.loc[df_results['PhoBERT Similarity'].idxmax()]
print("\nBest Matches per Method:")
print(f"SBERT: {best_sbert['Website']} with similarity {best_sbert['SBERT Similarity']:.4f}")
print(f"LaBSE: {best_labse['Website']} with similarity {best_labse['LaBSE Similarity']:.4f}")
print(f"TF-IDF: {best_tfidf['Website']} with similarity {best_tfidf['TF-IDF Similarity']:.4f}")
print(f"PhoBERT: {best_phobert['Website']} with similarity {best_phobert['PhoBERT Similarity']:.4f}")
# Find the best match index for each method
best_matches = {
    'SBERT': df_results['SBERT Similarity'].idxmax(),
    'LaBSE': df_results['LaBSE Similarity'].idxmax(),
    'TF-IDF': df_results['TF-IDF Similarity'].idxmax(),
    'PhoBERT': df_results['PhoBERT Similarity'].idxmax()
}
# Map indices to website names
best_matches = {method: df_results.loc[idx, 'Website'] for method, idx in best_matches.items()}
# Count occurrences
consensus = pd.Series(list(best_matches.values())).value_counts()
print("\nConsensus on Best Match:")
print(consensus)
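# An illustrative extension (not part of the original comparison): average the
# per-method ranks so that methods with different score scales vote equally.
score_cols = ['SBERT Similarity', 'LaBSE Similarity',
              'TF-IDF Similarity', 'PhoBERT Similarity']
df_results['Mean Rank'] = df_results[score_cols].rank(ascending=False).mean(axis=1)
print(df_results.sort_values('Mean Rank')[['Website', 'Mean Rank']])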