import pandas as pd

# Sample rule descriptions used throughout the similarity examples. Several
# entries are deliberate near-duplicates (reworded, or differing only in the
# number of days) and one entry is about an unrelated topic.
descripciones = [
    'All users must reset passwords every 90 days.',
    'Passwords need to be reset by all users every 90 days.',
    'Admin access should be restricted.',
    'Passwords must change for users every 90 days.',
    'Passwords must change for users every 80 days.',
]

# Load the dataset: one row per rule, with a 1-based sequential Rule_ID.
data = pd.DataFrame({
    'Rule_ID': range(1, len(descripciones) + 1),
    'Description': descripciones,
})
Lexical similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorize the descriptions with TF-IDF. Despite its name, `vectorizer` holds
# the result of fit_transform (the TF-IDF matrix), not the vectorizer object.
vectorizer = TfidfVectorizer().fit_transform(data['Description'])

# Compute the pairwise cosine-similarity matrix between all descriptions.
cosine_sim_matrix = cosine_similarity(vectorizer)


def find_related_rules(matrix, rule_ids, threshold=0.8):
    """Group rules whose pairwise similarity reaches ``threshold``.

    Only pairs with ``i < j`` are examined, so each unordered pair is
    reported once, under the rule that appears first in ``rule_ids``.

    Args:
        matrix: Square similarity matrix indexable as ``matrix[i, j]``.
        rule_ids: Rule identifiers, positionally aligned with ``matrix``.
        threshold: Minimum similarity (inclusive) for two rules to be
            considered related.

    Returns:
        A dict mapping a rule id to a list of ``(related_rule_id, score)``
        tuples, with ``score`` rounded to two decimals. Rules with no
        related rule above the threshold are omitted.
    """
    related_rules = {}
    seen_pairs = set()  # Guards against reporting the same id pair twice.
    for i in range(len(matrix)):
        related = []
        # j starts at i + 1 so (A, B) and (B, A) are never both visited and
        # a rule is never compared with itself.
        for j in range(i + 1, len(matrix)):
            if matrix[i, j] >= threshold:
                pair = (rule_ids[i], rule_ids[j])
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                    related.append((rule_ids[j], round(matrix[i, j], 2)))
        if related:
            related_rules[rule_ids[i]] = related
    return related_rules


# Apply the function to find related rules.
related_rules = find_related_rules(
    cosine_sim_matrix, data['Rule_ID'].tolist(), threshold=0.8
)

# Display the related rules.
print("Reglas relacionadas por similitud:")
for rule, relations in related_rules.items():
    print(f"Rule {rule} es similar a:")
    for related_rule, score in relations:
        print(f" - Rule {related_rule} con similitud de {score}")
Semantic similarity
# Prerequisite: the sentence-transformers package must be installed, e.g.
#   pip install sentence-transformers
# (in a notebook, run `!pip install sentence-transformers` in its own cell).
from sentence_transformers import SentenceTransformer, util

# Load the pre-trained model for generating embeddings.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate sentence embeddings for each rule description. A plain list of
# strings is passed so encoding does not depend on the Series index.
embeddings = model.encode(data['Description'].tolist(), convert_to_tensor=True)

# Compute the semantic similarity matrix and convert it to a NumPy array.
cosine_sim_matrix = util.cos_sim(embeddings, embeddings).cpu().numpy()


def find_related_rules(matrix, rule_ids, threshold=0.8):
    """Group rules whose pairwise similarity reaches ``threshold``.

    Only the upper triangle of ``matrix`` (``i < j``) is examined, so each
    unordered pair is reported once, under the rule that appears first in
    ``rule_ids``.

    Args:
        matrix: Square similarity matrix indexable as ``matrix[i, j]``.
        rule_ids: Rule identifiers, positionally aligned with ``matrix``.
        threshold: Minimum similarity (inclusive) for two rules to be
            considered related.

    Returns:
        A dict mapping a rule id to a list of ``(related_rule_id, score)``
        tuples, with ``score`` rounded to two decimals. Rules with no
        related rule above the threshold are omitted.
    """
    related_rules = {}
    seen_pairs = set()  # Guards against reporting the same id pair twice.
    for i in range(len(matrix)):
        related = []
        # Only consider the upper triangular part of the matrix.
        for j in range(i + 1, len(matrix)):
            if matrix[i, j] >= threshold:
                pair = (rule_ids[i], rule_ids[j])
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                    related.append((rule_ids[j], round(matrix[i, j], 2)))
        if related:
            related_rules[rule_ids[i]] = related
    return related_rules


# Apply the function to find related rules.
related_rules = find_related_rules(
    cosine_sim_matrix, data['Rule_ID'].tolist(), threshold=0.8
)

# Display the related rules.
print("Reglas relacionadas por similitud semántica:")
for rule, relations in related_rules.items():
    print(f"Rule {rule} es similar a:")
    for related_rule, score in relations:
        print(f" - Rule {related_rule} con similitud de {score}")
Top comments (0)
Subscribe
For further actions, you may consider blocking this person and/or reporting abuse
We're a place where coders share, stay up-to-date and grow their careers.
Top comments (0)