# -*- coding:utf-8 -*-
import re

import numpy as np

import config


def merge_stem(a_stem: list, b_stem: list) -> list:
    """Merge two stem lists into a single list without duplicates.

    Args:
        a_stem: Stems of the first text (hashable items).
        b_stem: Stems of the second text (hashable items).

    Returns:
        The union of both inputs. Items keep the order in which they
        first appear (``a_stem`` first, then ``b_stem``), so the result
        is deterministic from run to run.
    """
    # dict.fromkeys de-duplicates like a set but preserves insertion order.
    return list(dict.fromkeys([*a_stem, *b_stem]))


def gen_word_vec(a_word: str, b_word: str, stem: list) -> tuple:
    """Build occurrence-count vectors for two texts over a shared stem list.

    Args:
        a_word: First text to scan.
        b_word: Second text to scan.
        stem: Stems to count; each one is matched as a literal substring.

    Returns:
        A pair ``(a_vec, b_vec)`` of lists. Element ``i`` of each list is
        the number of non-overlapping occurrences of ``stem[i]`` in the
        corresponding text.
    """
    a_vec, b_vec = [], []
    for word in stem:
        # Escape every regex metacharacter so stems such as "c++" or
        # "node.js" are counted literally instead of being interpreted
        # as (possibly invalid) patterns.
        pattern = re.compile(re.escape(word))
        a_vec.append(len(pattern.findall(a_word)))
        b_vec.append(len(pattern.findall(b_word)))
    return a_vec, b_vec


def col_sim(vec1, vec2) -> float:
    """Compute the cosine similarity of two 1-D vectors.

    Args:
        vec1: First vector (NumPy array or any numeric sequence).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. When either vector
        has zero norm (all zeros, or empty) the similarity is undefined
        and 0.0 is returned instead of ``nan``.
    """
    v1 = np.asarray(vec1, dtype=float)
    v2 = np.asarray(vec2, dtype=float)
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        # Avoid 0/0: no stem occurs in at least one of the texts.
        return np.float64(0.0)
    return np.dot(v1, v2) / denominator


def cal_cos_sim(a_word: str, a_stem: list, b_word: str, b_stem: list) -> float:
    """Compute the cosine similarity between two texts.

    The stems of both texts are merged into one vocabulary, each text is
    turned into a vector of stem occurrence counts, and the cosine of the
    angle between the two vectors is returned.

    Args:
        a_word: First text.
        a_stem: Stems extracted from the first text.
        b_word: Second text.
        b_stem: Stems extracted from the second text.

    Returns:
        The cosine similarity of the two count vectors; 0.0 when it is
        undefined because one of the vectors has zero norm.
    """
    union_stem = merge_stem(a_stem, b_stem)
    a_vec, b_vec = gen_word_vec(a_word, b_word, union_stem)
    return col_sim(np.array(a_vec), np.array(b_vec))